Blob DB: avoid having a separate read of checksum

Summary:
Previously, on a blob db read, we made one read to fetch the blob value and then a second read to fetch the CRC checksum. This change combines the two reads into one.

readrandom db_bench results on a 1GB database with a base db size of 13MB and a value size of 1KB:
`./db_bench --db=/home/yiwu/tmp/db_bench --use_blob_db --value_size=1024 --num=1000000 --benchmarks=readrandom --use_existing_db --cache_size=32000000`
master: throughput 234MB/s, get micros p50 5.984 p95 9.998 p99 20.817 p100 787
this PR: throughput 261MB/s, get micros p50 5.157 p95 9.928 p99 20.724 p100 190
Closes https://github.com/facebook/rocksdb/pull/3301

Differential Revision: D6615950

Pulled By: yiwu-arbug

fbshipit-source-id: 052410c6d8539ec0cc305d53793bbc8f3616baa3
This commit is contained in:
Yi Wu 2018-01-05 16:35:49 -08:00
parent af9746fd22
commit 6d4ef9e839

View File

@ -1055,58 +1055,58 @@ Status BlobDBImpl::GetBlobValue(const Slice& key, const Slice& index_entry,
std::shared_ptr<RandomAccessFileReader> reader = std::shared_ptr<RandomAccessFileReader> reader =
GetOrOpenRandomAccessReader(bfile, env_, env_options_); GetOrOpenRandomAccessReader(bfile, env_, env_options_);
std::string* valueptr = value->GetSelf(); assert(blob_index.offset() > key.size() + sizeof(uint32_t));
std::string value_c; uint64_t record_offset = blob_index.offset() - key.size() - sizeof(uint32_t);
if (bdb_options_.compression != kNoCompression) { uint64_t record_size = sizeof(uint32_t) + key.size() + blob_index.size();
valueptr = &value_c;
}
// Allocate the buffer. This is safe in C++11 // Allocate the buffer. This is safe in C++11
// Note that std::string::reserved() does not work, since previous value std::string buffer_str(record_size, static_cast<char>(0));
// of the buffer can be larger than blob_index.size(). char* buffer = &buffer_str[0];
valueptr->resize(blob_index.size());
char* buffer = &(*valueptr)[0];
Slice blob_value; // A partial blob record contain checksum, key and value.
Slice blob_record;
{ {
StopWatch read_sw(env_, statistics_, BLOB_DB_BLOB_FILE_READ_MICROS); StopWatch read_sw(env_, statistics_, BLOB_DB_BLOB_FILE_READ_MICROS);
s = reader->Read(blob_index.offset(), blob_index.size(), &blob_value, s = reader->Read(record_offset, record_size, &blob_record, buffer);
buffer); RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, blob_record.size());
RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, blob_value.size());
} }
if (!s.ok() || blob_value.size() != blob_index.size()) { if (!s.ok()) {
if (debug_level_ >= 2) { ROCKS_LOG_DEBUG(db_options_.info_log,
ROCKS_LOG_ERROR(db_options_.info_log, "Failed to read blob from blob file %" PRIu64
"Failed to read blob from file: %s blob_offset: %" PRIu64 ", blob_offset: %" PRIu64 ", blob_size: %" PRIu64
" blob_size: %" PRIu64 " read: %d key: %s status: '%s'", ", key_size: " PRIu64 ", read " PRIu64
bfile->PathName().c_str(), blob_index.offset(), "bytes, status: '%s'",
blob_index.size(), static_cast<int>(blob_value.size()), bfile->BlobFileNumber(), blob_index.offset(),
key.data(), s.ToString().c_str()); blob_index.size(), key.size(), s.ToString().c_str());
} return s;
return Status::NotFound("Blob Not Found as couldnt retrieve Blob");
} }
if (blob_record.size() != record_size) {
ROCKS_LOG_DEBUG(db_options_.info_log,
"Failed to read blob from blob file %" PRIu64
", blob_offset: %" PRIu64 ", blob_size: %" PRIu64
", key_size: " PRIu64 ", read " PRIu64
"bytes, status: '%s'",
bfile->BlobFileNumber(), blob_index.offset(),
blob_index.size(), key.size(), s.ToString().c_str());
// TODO(yiwu): Add an option to skip crc checking. return Status::Corruption("Failed to retrieve blob from blob index.");
Slice crc_slice; }
Slice crc_slice(blob_record.data(), sizeof(uint32_t));
Slice blob_value(blob_record.data() + sizeof(uint32_t) + key.size(),
blob_index.size());
uint32_t crc_exp; uint32_t crc_exp;
std::string crc_str; if (!GetFixed32(&crc_slice, &crc_exp)) {
crc_str.resize(sizeof(uint32_t)); ROCKS_LOG_DEBUG(db_options_.info_log,
char* crc_buffer = &(crc_str[0]); "Unable to decode CRC from blob file %" PRIu64
s = reader->Read(blob_index.offset() - (key.size() + sizeof(uint32_t)), ", blob_offset: %" PRIu64 ", blob_size: %" PRIu64
sizeof(uint32_t), &crc_slice, crc_buffer); ", key size: %" PRIu64 ", status: '%s'",
if (!s.ok() || !GetFixed32(&crc_slice, &crc_exp)) { bfile->BlobFileNumber(), blob_index.offset(),
if (debug_level_ >= 2) { blob_index.size(), key.size(), s.ToString().c_str());
ROCKS_LOG_ERROR(db_options_.info_log, return Status::Corruption("Unable to decode checksum.");
"Failed to fetch blob crc file: %s blob_offset: %" PRIu64
" blob_size: %" PRIu64 " key: %s status: '%s'",
bfile->PathName().c_str(), blob_index.offset(),
blob_index.size(), key.data(), s.ToString().c_str());
}
return Status::NotFound("Blob Not Found as couldnt retrieve CRC");
} }
uint32_t crc = crc32c::Value(key.data(), key.size()); uint32_t crc = crc32c::Value(blob_record.data() + sizeof(uint32_t),
crc = crc32c::Extend(crc, blob_value.data(), blob_value.size()); blob_record.size() - sizeof(uint32_t));
crc = crc32c::Mask(crc); // Adjust for storage crc = crc32c::Mask(crc); // Adjust for storage
if (crc != crc_exp) { if (crc != crc_exp) {
if (debug_level_ >= 2) { if (debug_level_ >= 2) {
@ -1119,7 +1119,9 @@ Status BlobDBImpl::GetBlobValue(const Slice& key, const Slice& index_entry,
return Status::Corruption("Corruption. Blob CRC mismatch"); return Status::Corruption("Corruption. Blob CRC mismatch");
} }
if (bfile->compression() != kNoCompression) { if (bfile->compression() == kNoCompression) {
value->PinSelf(blob_value);
} else {
BlockContents contents; BlockContents contents;
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(DefaultColumnFamily()); auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(DefaultColumnFamily());
{ {
@ -1130,11 +1132,9 @@ Status BlobDBImpl::GetBlobValue(const Slice& key, const Slice& index_entry,
kBlockBasedTableVersionFormat, Slice(), bfile->compression(), kBlockBasedTableVersionFormat, Slice(), bfile->compression(),
*(cfh->cfd()->ioptions())); *(cfh->cfd()->ioptions()));
} }
*(value->GetSelf()) = contents.data.ToString(); value->PinSelf(contents.data);
} }
value->PinSelf();
return s; return s;
} }