2013-10-16 23:59:46 +02:00
|
|
|
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under the BSD-style license found in the
|
|
|
|
// LICENSE file in the root directory of this source tree. An additional grant
|
|
|
|
// of patent rights can be found in the PATENTS file in the same directory.
|
|
|
|
//
|
2011-03-18 23:37:00 +01:00
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
|
|
|
#include "table/format.h"
|
|
|
|
|
2014-02-05 01:21:47 +01:00
|
|
|
#include <string>
|
2014-02-08 04:26:49 +01:00
|
|
|
#include <inttypes.h>
|
2014-02-05 01:21:47 +01:00
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
#include "port/port.h"
|
[RocksDB] Added nano second stopwatch and new perf counters to track block read cost
Summary: The pupose of this diff is to expose per user-call level precise timing of block read, so that we can answer questions like: a Get() costs me 100ms, is that somehow related to loading blocks from file system, or sth else? We will answer that with EXACTLY how many blocks have been read, how much time was spent on transfering the bytes from os, how much time was spent on checksum verification and how much time was spent on block decompression, just for that one Get. A nano second stopwatch was introduced to track time with higher precision. The cost/precision of the stopwatch is also measured in unit-test. On my dev box, retrieving one time instance costs about 30ns, on average. The deviation of timing results is good enough to track 100ns-1us level events. And the overhead could be safely ignored for 100us level events (10000 instances/s), for example, a viewstate thrift call.
Test Plan: perf_context_test, also testing with viewstate shadow traffic.
Reviewers: dhruba
Reviewed By: dhruba
CC: leveldb, xjin
Differential Revision: https://reviews.facebook.net/D12351
2013-06-04 08:09:15 +02:00
|
|
|
#include "rocksdb/env.h"
|
2011-03-18 23:37:00 +01:00
|
|
|
#include "table/block.h"
|
|
|
|
#include "util/coding.h"
|
|
|
|
#include "util/crc32c.h"
|
[RocksDB] Added nano second stopwatch and new perf counters to track block read cost
Summary: The pupose of this diff is to expose per user-call level precise timing of block read, so that we can answer questions like: a Get() costs me 100ms, is that somehow related to loading blocks from file system, or sth else? We will answer that with EXACTLY how many blocks have been read, how much time was spent on transfering the bytes from os, how much time was spent on checksum verification and how much time was spent on block decompression, just for that one Get. A nano second stopwatch was introduced to track time with higher precision. The cost/precision of the stopwatch is also measured in unit-test. On my dev box, retrieving one time instance costs about 30ns, on average. The deviation of timing results is good enough to track 100ns-1us level events. And the overhead could be safely ignored for 100us level events (10000 instances/s), for example, a viewstate thrift call.
Test Plan: perf_context_test, also testing with viewstate shadow traffic.
Reviewers: dhruba
Reviewed By: dhruba
CC: leveldb, xjin
Differential Revision: https://reviews.facebook.net/D12351
2013-06-04 08:09:15 +02:00
|
|
|
#include "util/perf_context_imp.h"
|
2011-03-18 23:37:00 +01:00
|
|
|
|
2013-10-04 06:49:15 +02:00
|
|
|
namespace rocksdb {
|
2011-03-18 23:37:00 +01:00
|
|
|
|
|
|
|
// Appends the varint64 encoding of this handle (offset first, then size)
// to *dst.
void BlockHandle::EncodeTo(std::string* dst) const {
  // An all-ones value in either field means the field was never assigned;
  // encoding such a handle is a programming error.
  assert(~offset_ != 0);
  assert(~size_ != 0);

  PutVarint64(dst, offset_);
  PutVarint64(dst, size_);
}
|
|
|
|
|
|
|
|
// Parses two varint64 values from *input into offset_ and size_, in that
// order. Returns Status::Corruption if either value cannot be decoded.
Status BlockHandle::DecodeFrom(Slice* input) {
  // The size is only attempted once the offset has been read successfully.
  const bool decoded =
      GetVarint64(input, &offset_) && GetVarint64(input, &size_);
  if (!decoded) {
    return Status::Corruption("bad block handle");
  }
  return Status::OK();
}
|
2013-12-05 00:43:09 +01:00
|
|
|
// Out-of-class definition of the static kNullBlockHandle constant.
// NOTE(review): the (0, 0) arguments are presumably (offset, size) --
// confirm against the BlockHandle constructor declared in table/format.h.
const BlockHandle BlockHandle::kNullBlockHandle(0, 0);
|
2011-03-18 23:37:00 +01:00
|
|
|
|
|
|
|
void Footer::EncodeTo(std::string* dst) const {
|
|
|
|
#ifndef NDEBUG
|
|
|
|
const size_t original_size = dst->size();
|
|
|
|
#endif
|
|
|
|
metaindex_handle_.EncodeTo(dst);
|
|
|
|
index_handle_.EncodeTo(dst);
|
|
|
|
dst->resize(2 * BlockHandle::kMaxEncodedLength); // Padding
|
2014-02-05 01:21:47 +01:00
|
|
|
PutFixed32(dst, static_cast<uint32_t>(table_magic_number() & 0xffffffffu));
|
|
|
|
PutFixed32(dst, static_cast<uint32_t>(table_magic_number() >> 32));
|
2011-03-18 23:37:00 +01:00
|
|
|
assert(dst->size() == original_size + kEncodedLength);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Decodes a footer from the front of *input.
//
// *input must be non-null and hold at least kEncodedLength bytes (checked
// by assert only). The magic number is read from the last
// kMagicNumberLengthByte bytes of the kEncodedLength-byte footer:
//   - if this Footer already has a magic number, a mismatch yields
//     Status::InvalidArgument;
//   - otherwise the decoded magic number is stored in this Footer.
// The metaindex and index handles are then decoded. On success *input is
// repositioned to start just past the magic number; on failure the status
// from the failing handle decode is returned.
Status Footer::DecodeFrom(Slice* input) {
  assert(input != nullptr);
  assert(input->size() >= kEncodedLength);

  const char* magic_ptr =
      input->data() + kEncodedLength - kMagicNumberLengthByte;
  const uint32_t magic_lo = DecodeFixed32(magic_ptr);
  const uint32_t magic_hi = DecodeFixed32(magic_ptr + 4);
  const uint64_t magic = ((static_cast<uint64_t>(magic_hi) << 32) |
                          (static_cast<uint64_t>(magic_lo)));

  if (HasInitializedTableMagicNumber()) {
    if (magic != table_magic_number()) {
      char buffer[80];
      // Print all 64 bits: "%lx" with a (long) cast drops the upper half of
      // the magic number wherever long is 32 bits wide.
      snprintf(buffer, sizeof(buffer),
               "not an sstable (bad magic number --- %llx)",
               static_cast<unsigned long long>(magic));
      return Status::InvalidArgument(buffer);
    }
  } else {
    set_table_magic_number(magic);
  }

  Status result = metaindex_handle_.DecodeFrom(input);
  if (result.ok()) {
    result = index_handle_.DecodeFrom(input);
  }
  if (result.ok()) {
    // We skip over any leftover data (just padding for now) in "input"
    const char* end = magic_ptr + 8;
    *input = Slice(end, input->data() + input->size() - end);
  }
  return result;
}
|
|
|
|
|
2013-12-05 01:35:48 +01:00
|
|
|
// Reads the last Footer::kEncodedLength bytes of `file` and decodes them
// into *footer.
//
// Returns Status::InvalidArgument when `file_size` is smaller than an
// encoded footer or when the read yields a different number of bytes than
// requested; otherwise returns the read error or the result of
// Footer::DecodeFrom.
Status ReadFooterFromFile(RandomAccessFile* file,
                          uint64_t file_size,
                          Footer* footer) {
  if (file_size < Footer::kEncodedLength) {
    return Status::InvalidArgument("file is too short to be an sstable");
  }

  char scratch[Footer::kEncodedLength];
  Slice encoded_footer;
  const uint64_t footer_offset = file_size - Footer::kEncodedLength;
  Status read_status = file->Read(footer_offset, Footer::kEncodedLength,
                                  &encoded_footer, scratch);
  if (!read_status.ok()) {
    return read_status;
  }

  // The caller-supplied file size may be wrong, so make sure the whole
  // footer was actually read before decoding it.
  if (encoded_footer.size() != Footer::kEncodedLength) {
    return Status::InvalidArgument("file is too short to be an sstable");
  }

  return footer->DecodeFrom(&encoded_footer);
}
|
|
|
|
|
2013-04-23 08:47:56 +02:00
|
|
|
// Reads the block identified by `handle` from `file` into *result.
//
// The read covers handle.size() payload bytes plus a kBlockTrailerSize-byte
// trailer. The byte at index handle.size() is the compression type and is
// followed by a masked CRC32C covering the payload and that type byte.
//
//   options.verify_checksums  when set, a CRC mismatch yields Corruption.
//   do_uncompress             when false, or when the block is stored with
//                             kNoCompression, the payload is returned as
//                             read; otherwise it is handed to
//                             UncompressBlockContents.
//   env                       not referenced in this function.
//
// *result is reset to an empty, non-cachable, non-heap-allocated state
// before anything else happens. See table_builder.cc for the code that
// writes this structure.
Status ReadBlockContents(RandomAccessFile* file,
                         const ReadOptions& options,
                         const BlockHandle& handle,
                         BlockContents* result,
                         Env* env,
                         bool do_uncompress) {
  result->data = Slice();
  result->cachable = false;
  result->heap_allocated = false;

  // Payload length and total number of bytes to fetch (payload + trailer).
  const size_t n = static_cast<size_t>(handle.size());
  const size_t read_size = n + kBlockTrailerSize;
  char* buf = new char[read_size];
  Slice contents;

  PERF_TIMER_AUTO(block_read_time);
  Status s = file->Read(handle.offset(), read_size, &contents, buf);
  PERF_TIMER_MEASURE(block_read_time);
  PERF_COUNTER_ADD(block_read_count, 1);
  PERF_COUNTER_ADD(block_read_byte, read_size);

  if (!s.ok()) {
    delete[] buf;
    return s;
  }
  if (contents.size() != read_size) {
    delete[] buf;
    return Status::Corruption("truncated block read");
  }

  // Read() may have filled `buf` or pointed `contents` at memory it owns.
  const char* data = contents.data();

  if (options.verify_checksums) {
    // The checksum covers the payload plus the one-byte compression type.
    const uint32_t expected = crc32c::Unmask(DecodeFixed32(data + n + 1));
    const uint32_t computed = crc32c::Value(data, n + 1);
    if (computed != expected) {
      delete[] buf;
      s = Status::Corruption("block checksum mismatch");
      return s;
    }
    PERF_TIMER_MEASURE(block_checksum_time);
  }

  const bool return_as_read = !do_uncompress || data[n] == kNoCompression;
  if (return_as_read) {
    // When the file implementation handed back a pointer to its own data,
    // use it directly under the assumption that it stays live while the
    // file is open; our scratch buffer is then unused and can be freed.
    const bool borrowed = (data != buf);
    if (borrowed) {
      delete[] buf;
    }
    result->data = Slice(data, n);
    result->heap_allocated = !borrowed;
    result->cachable = !borrowed;  // Do not double-cache borrowed data
    result->compression_type = static_cast<rocksdb::CompressionType>(data[n]);
    s = Status::OK();
  } else {
    s = UncompressBlockContents(data, n, result);
    delete[] buf;
  }
  PERF_TIMER_STOP(block_decompress_time);
  return s;
}
|
|
|
|
|
|
|
|
//
|
|
|
|
// The 'data' points to the raw block contents that was read in from file.
|
|
|
|
// This method allocates a new heap buffer and the raw block
|
|
|
|
// contents are uncompresed into this buffer. This
|
|
|
|
// buffer is returned via 'result' and it is upto the caller to
|
|
|
|
// free this buffer.
|
|
|
|
Status UncompressBlockContents(const char* data, size_t n,
|
|
|
|
BlockContents* result) {
|
2013-03-01 03:04:58 +01:00
|
|
|
char* ubuf = nullptr;
|
2012-06-29 04:26:43 +02:00
|
|
|
int decompress_size = 0;
|
2013-09-02 08:23:40 +02:00
|
|
|
assert(data[n] != kNoCompression);
|
2011-03-18 23:37:00 +01:00
|
|
|
switch (data[n]) {
|
2011-03-23 00:24:02 +01:00
|
|
|
case kSnappyCompression: {
|
2011-07-21 04:40:18 +02:00
|
|
|
size_t ulength = 0;
|
2012-12-20 23:25:06 +01:00
|
|
|
static char snappy_corrupt_msg[] =
|
|
|
|
"Snappy not supported or corrupted Snappy compressed block contents";
|
2011-07-21 04:40:18 +02:00
|
|
|
if (!port::Snappy_GetUncompressedLength(data, n, &ulength)) {
|
2012-12-20 23:25:06 +01:00
|
|
|
return Status::Corruption(snappy_corrupt_msg);
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
2012-06-28 08:41:33 +02:00
|
|
|
ubuf = new char[ulength];
|
2011-07-21 04:40:18 +02:00
|
|
|
if (!port::Snappy_Uncompress(data, n, ubuf)) {
|
|
|
|
delete[] ubuf;
|
2012-12-20 23:25:06 +01:00
|
|
|
return Status::Corruption(snappy_corrupt_msg);
|
2011-07-21 04:40:18 +02:00
|
|
|
}
|
2012-04-17 17:36:46 +02:00
|
|
|
result->data = Slice(ubuf, ulength);
|
|
|
|
result->heap_allocated = true;
|
|
|
|
result->cachable = true;
|
2011-03-18 23:37:00 +01:00
|
|
|
break;
|
|
|
|
}
|
2012-06-28 08:41:33 +02:00
|
|
|
case kZlibCompression:
|
|
|
|
ubuf = port::Zlib_Uncompress(data, n, &decompress_size);
|
2012-12-20 23:25:06 +01:00
|
|
|
static char zlib_corrupt_msg[] =
|
|
|
|
"Zlib not supported or corrupted Zlib compressed block contents";
|
2012-06-28 08:41:33 +02:00
|
|
|
if (!ubuf) {
|
2012-12-20 23:25:06 +01:00
|
|
|
return Status::Corruption(zlib_corrupt_msg);
|
2012-06-28 08:41:33 +02:00
|
|
|
}
|
|
|
|
result->data = Slice(ubuf, decompress_size);
|
|
|
|
result->heap_allocated = true;
|
|
|
|
result->cachable = true;
|
|
|
|
break;
|
2012-06-29 04:26:43 +02:00
|
|
|
case kBZip2Compression:
|
|
|
|
ubuf = port::BZip2_Uncompress(data, n, &decompress_size);
|
2012-12-20 23:25:06 +01:00
|
|
|
static char bzip2_corrupt_msg[] =
|
|
|
|
"Bzip2 not supported or corrupted Bzip2 compressed block contents";
|
2012-06-29 04:26:43 +02:00
|
|
|
if (!ubuf) {
|
2012-12-20 23:25:06 +01:00
|
|
|
return Status::Corruption(bzip2_corrupt_msg);
|
2012-06-29 04:26:43 +02:00
|
|
|
}
|
|
|
|
result->data = Slice(ubuf, decompress_size);
|
|
|
|
result->heap_allocated = true;
|
|
|
|
result->cachable = true;
|
|
|
|
break;
|
2014-02-08 03:12:30 +01:00
|
|
|
case kLZ4Compression:
|
|
|
|
ubuf = port::LZ4_Uncompress(data, n, &decompress_size);
|
|
|
|
static char lz4_corrupt_msg[] =
|
|
|
|
"LZ4 not supported or corrupted LZ4 compressed block contents";
|
|
|
|
if (!ubuf) {
|
|
|
|
return Status::Corruption(lz4_corrupt_msg);
|
|
|
|
}
|
|
|
|
result->data = Slice(ubuf, decompress_size);
|
|
|
|
result->heap_allocated = true;
|
|
|
|
result->cachable = true;
|
|
|
|
break;
|
|
|
|
case kLZ4HCCompression:
|
|
|
|
ubuf = port::LZ4_Uncompress(data, n, &decompress_size);
|
|
|
|
static char lz4hc_corrupt_msg[] =
|
|
|
|
"LZ4HC not supported or corrupted LZ4HC compressed block contents";
|
|
|
|
if (!ubuf) {
|
|
|
|
return Status::Corruption(lz4hc_corrupt_msg);
|
|
|
|
}
|
|
|
|
result->data = Slice(ubuf, decompress_size);
|
|
|
|
result->heap_allocated = true;
|
|
|
|
result->cachable = true;
|
|
|
|
break;
|
2011-03-18 23:37:00 +01:00
|
|
|
default:
|
|
|
|
return Status::Corruption("bad block type");
|
|
|
|
}
|
2014-02-05 01:21:47 +01:00
|
|
|
result->compression_type = kNoCompression; // not compressed any more
|
2011-03-18 23:37:00 +01:00
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
2013-10-04 06:49:15 +02:00
|
|
|
} // namespace rocksdb
|