Blob DB: avoid having a separate read of checksum

Summary:
Previously on a blob db read, we are making a read of the blob value, and then make another read to get CRC checksum. I'm combining the two read into one.

readrandom db_bench with 1G database with base db size of 13M, value size 1k:
`./db_bench --db=/home/yiwu/tmp/db_bench --use_blob_db --value_size=1024 --num=1000000 --benchmarks=readrandom --use_existing_db --cache_size=32000000`
master: throughput 234MB/s, get micros p50 5.984 p95 9.998 p99 20.817 p100 787
this PR: throughput 261MB/s, get micros p50 5.157 p95 9.928 p99 20.724 p100 190
Closes https://github.com/facebook/rocksdb/pull/3301

Differential Revision: D6615950

Pulled By: yiwu-arbug

fbshipit-source-id: 052410c6d8539ec0cc305d53793bbc8f3616baa3
This commit is contained in:
Yi Wu 2018-01-05 16:35:49 -08:00 committed by Facebook Github Bot
parent 3e955fad09
commit 30a017feca

View File

@ -1055,58 +1055,58 @@ Status BlobDBImpl::GetBlobValue(const Slice& key, const Slice& index_entry,
std::shared_ptr<RandomAccessFileReader> reader =
GetOrOpenRandomAccessReader(bfile, env_, env_options_);
std::string* valueptr = value->GetSelf();
std::string value_c;
if (bdb_options_.compression != kNoCompression) {
valueptr = &value_c;
}
assert(blob_index.offset() > key.size() + sizeof(uint32_t));
uint64_t record_offset = blob_index.offset() - key.size() - sizeof(uint32_t);
uint64_t record_size = sizeof(uint32_t) + key.size() + blob_index.size();
// Allocate the buffer. This is safe in C++11
// Note that std::string::reserved() does not work, since previous value
// of the buffer can be larger than blob_index.size().
valueptr->resize(blob_index.size());
char* buffer = &(*valueptr)[0];
std::string buffer_str(record_size, static_cast<char>(0));
char* buffer = &buffer_str[0];
Slice blob_value;
// A partial blob record contain checksum, key and value.
Slice blob_record;
{
StopWatch read_sw(env_, statistics_, BLOB_DB_BLOB_FILE_READ_MICROS);
s = reader->Read(blob_index.offset(), blob_index.size(), &blob_value,
buffer);
RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, blob_value.size());
s = reader->Read(record_offset, record_size, &blob_record, buffer);
RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, blob_record.size());
}
if (!s.ok() || blob_value.size() != blob_index.size()) {
if (debug_level_ >= 2) {
ROCKS_LOG_ERROR(db_options_.info_log,
"Failed to read blob from file: %s blob_offset: %" PRIu64
" blob_size: %" PRIu64 " read: %d key: %s status: '%s'",
bfile->PathName().c_str(), blob_index.offset(),
blob_index.size(), static_cast<int>(blob_value.size()),
key.data(), s.ToString().c_str());
}
return Status::NotFound("Blob Not Found as couldnt retrieve Blob");
if (!s.ok()) {
ROCKS_LOG_DEBUG(db_options_.info_log,
"Failed to read blob from blob file %" PRIu64
", blob_offset: %" PRIu64 ", blob_size: %" PRIu64
", key_size: " PRIu64 ", read " PRIu64
"bytes, status: '%s'",
bfile->BlobFileNumber(), blob_index.offset(),
blob_index.size(), key.size(), s.ToString().c_str());
return s;
}
if (blob_record.size() != record_size) {
ROCKS_LOG_DEBUG(db_options_.info_log,
"Failed to read blob from blob file %" PRIu64
", blob_offset: %" PRIu64 ", blob_size: %" PRIu64
", key_size: " PRIu64 ", read " PRIu64
"bytes, status: '%s'",
bfile->BlobFileNumber(), blob_index.offset(),
blob_index.size(), key.size(), s.ToString().c_str());
// TODO(yiwu): Add an option to skip crc checking.
Slice crc_slice;
return Status::Corruption("Failed to retrieve blob from blob index.");
}
Slice crc_slice(blob_record.data(), sizeof(uint32_t));
Slice blob_value(blob_record.data() + sizeof(uint32_t) + key.size(),
blob_index.size());
uint32_t crc_exp;
std::string crc_str;
crc_str.resize(sizeof(uint32_t));
char* crc_buffer = &(crc_str[0]);
s = reader->Read(blob_index.offset() - (key.size() + sizeof(uint32_t)),
sizeof(uint32_t), &crc_slice, crc_buffer);
if (!s.ok() || !GetFixed32(&crc_slice, &crc_exp)) {
if (debug_level_ >= 2) {
ROCKS_LOG_ERROR(db_options_.info_log,
"Failed to fetch blob crc file: %s blob_offset: %" PRIu64
" blob_size: %" PRIu64 " key: %s status: '%s'",
bfile->PathName().c_str(), blob_index.offset(),
blob_index.size(), key.data(), s.ToString().c_str());
}
return Status::NotFound("Blob Not Found as couldnt retrieve CRC");
if (!GetFixed32(&crc_slice, &crc_exp)) {
ROCKS_LOG_DEBUG(db_options_.info_log,
"Unable to decode CRC from blob file %" PRIu64
", blob_offset: %" PRIu64 ", blob_size: %" PRIu64
", key size: %" PRIu64 ", status: '%s'",
bfile->BlobFileNumber(), blob_index.offset(),
blob_index.size(), key.size(), s.ToString().c_str());
return Status::Corruption("Unable to decode checksum.");
}
uint32_t crc = crc32c::Value(key.data(), key.size());
crc = crc32c::Extend(crc, blob_value.data(), blob_value.size());
uint32_t crc = crc32c::Value(blob_record.data() + sizeof(uint32_t),
blob_record.size() - sizeof(uint32_t));
crc = crc32c::Mask(crc); // Adjust for storage
if (crc != crc_exp) {
if (debug_level_ >= 2) {
@ -1119,7 +1119,9 @@ Status BlobDBImpl::GetBlobValue(const Slice& key, const Slice& index_entry,
return Status::Corruption("Corruption. Blob CRC mismatch");
}
if (bfile->compression() != kNoCompression) {
if (bfile->compression() == kNoCompression) {
value->PinSelf(blob_value);
} else {
BlockContents contents;
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(DefaultColumnFamily());
{
@ -1130,11 +1132,9 @@ Status BlobDBImpl::GetBlobValue(const Slice& key, const Slice& index_entry,
kBlockBasedTableVersionFormat, Slice(), bfile->compression(),
*(cfh->cfd()->ioptions()));
}
*(value->GetSelf()) = contents.data.ToString();
value->PinSelf(contents.data);
}
value->PinSelf();
return s;
}