2016-02-10 00:12:00 +01:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
2013-10-16 23:59:46 +02:00
|
|
|
// This source code is licensed under the BSD-style license found in the
|
|
|
|
// LICENSE file in the root directory of this source tree. An additional grant
|
|
|
|
// of patent rights can be found in the PATENTS file in the same directory.
|
|
|
|
//
|
2011-03-18 23:37:00 +01:00
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
|
|
|
#include "table/format.h"
|
|
|
|
|
2014-02-05 01:21:47 +01:00
|
|
|
#include <string>
|
2014-02-08 04:26:49 +01:00
|
|
|
#include <inttypes.h>
|
2014-02-05 01:21:47 +01:00
|
|
|
|
[RocksDB] Added nano second stopwatch and new perf counters to track block read cost
Summary: The pupose of this diff is to expose per user-call level precise timing of block read, so that we can answer questions like: a Get() costs me 100ms, is that somehow related to loading blocks from file system, or sth else? We will answer that with EXACTLY how many blocks have been read, how much time was spent on transfering the bytes from os, how much time was spent on checksum verification and how much time was spent on block decompression, just for that one Get. A nano second stopwatch was introduced to track time with higher precision. The cost/precision of the stopwatch is also measured in unit-test. On my dev box, retrieving one time instance costs about 30ns, on average. The deviation of timing results is good enough to track 100ns-1us level events. And the overhead could be safely ignored for 100us level events (10000 instances/s), for example, a viewstate thrift call.
Test Plan: perf_context_test, also testing with viewstate shadow traffic.
Reviewers: dhruba
Reviewed By: dhruba
CC: leveldb, xjin
Differential Revision: https://reviews.facebook.net/D12351
2013-06-04 08:09:15 +02:00
|
|
|
#include "rocksdb/env.h"
|
2011-03-18 23:37:00 +01:00
|
|
|
#include "table/block.h"
|
2015-12-16 03:20:10 +01:00
|
|
|
#include "table/block_based_table_reader.h"
|
|
|
|
#include "table/persistent_cache_helper.h"
|
2011-03-18 23:37:00 +01:00
|
|
|
#include "util/coding.h"
|
2015-01-09 21:57:11 +01:00
|
|
|
#include "util/compression.h"
|
2011-03-18 23:37:00 +01:00
|
|
|
#include "util/crc32c.h"
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
#include "util/file_reader_writer.h"
|
[RocksDB] Added nano second stopwatch and new perf counters to track block read cost
Summary: The pupose of this diff is to expose per user-call level precise timing of block read, so that we can answer questions like: a Get() costs me 100ms, is that somehow related to loading blocks from file system, or sth else? We will answer that with EXACTLY how many blocks have been read, how much time was spent on transfering the bytes from os, how much time was spent on checksum verification and how much time was spent on block decompression, just for that one Get. A nano second stopwatch was introduced to track time with higher precision. The cost/precision of the stopwatch is also measured in unit-test. On my dev box, retrieving one time instance costs about 30ns, on average. The deviation of timing results is good enough to track 100ns-1us level events. And the overhead could be safely ignored for 100us level events (10000 instances/s), for example, a viewstate thrift call.
Test Plan: perf_context_test, also testing with viewstate shadow traffic.
Reviewers: dhruba
Reviewed By: dhruba
CC: leveldb, xjin
Differential Revision: https://reviews.facebook.net/D12351
2013-06-04 08:09:15 +02:00
|
|
|
#include "util/perf_context_imp.h"
|
2015-04-24 04:17:57 +02:00
|
|
|
#include "util/string_util.h"
|
2014-05-01 20:09:32 +02:00
|
|
|
#include "util/xxhash.h"
|
2011-03-18 23:37:00 +01:00
|
|
|
|
2013-10-04 06:49:15 +02:00
|
|
|
namespace rocksdb {
|
2011-03-18 23:37:00 +01:00
|
|
|
|
2014-05-01 20:09:32 +02:00
|
|
|
extern const uint64_t kLegacyBlockBasedTableMagicNumber;
|
|
|
|
extern const uint64_t kBlockBasedTableMagicNumber;
|
2014-05-08 02:45:27 +02:00
|
|
|
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
extern const uint64_t kLegacyPlainTableMagicNumber;
|
2014-05-01 20:09:32 +02:00
|
|
|
extern const uint64_t kPlainTableMagicNumber;
|
2014-05-08 02:45:27 +02:00
|
|
|
#else
|
|
|
|
// ROCKSDB_LITE doesn't have plain table
|
|
|
|
const uint64_t kLegacyPlainTableMagicNumber = 0;
|
|
|
|
const uint64_t kPlainTableMagicNumber = 0;
|
|
|
|
#endif
|
2014-07-31 08:11:59 +02:00
|
|
|
// Size, in bytes, of the on-stack scratch buffer that ReadBlockContents
// declares; reads whose block-plus-trailer size is below this value (and that
// request decompression) use that buffer instead of a heap allocation.
const uint32_t DefaultStackBufferSize = 5000;
|
2014-05-01 20:09:32 +02:00
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
// Appends this handle to *dst as two varint64 values: the offset first, then
// the size.
void BlockHandle::EncodeTo(std::string* dst) const {
  // An all-ones field is what these checks treat as "never assigned".
  assert(~static_cast<uint64_t>(0) != offset_);
  assert(~static_cast<uint64_t>(0) != size_);

  PutVarint64(dst, offset_);
  PutVarint64(dst, size_);
}
|
|
|
|
|
|
|
|
// Reads the offset and then the size from *input with GetVarint64.
// Returns Status::Corruption if either value cannot be read.
Status BlockHandle::DecodeFrom(Slice* input) {
  const bool decoded =
      GetVarint64(input, &offset_) && GetVarint64(input, &size_);
  if (!decoded) {
    return Status::Corruption("bad block handle");
  }
  return Status::OK();
}
|
2014-12-23 22:24:07 +01:00
|
|
|
|
|
|
|
// Return a string that contains the copy of handle.
|
|
|
|
std::string BlockHandle::ToString(bool hex) const {
|
|
|
|
std::string handle_str;
|
|
|
|
EncodeTo(&handle_str);
|
|
|
|
if (hex) {
|
2016-03-30 06:25:12 +02:00
|
|
|
return Slice(handle_str).ToString(true);
|
2014-12-23 22:24:07 +01:00
|
|
|
} else {
|
|
|
|
return handle_str;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-12-05 00:43:09 +01:00
|
|
|
// Shared constant handle built with the arguments (0, 0) -- presumably
// offset 0 and size 0; confirm against the BlockHandle constructor.
const BlockHandle BlockHandle::kNullBlockHandle(0, 0);
|
2011-03-18 23:37:00 +01:00
|
|
|
|
2015-01-13 23:33:04 +01:00
|
|
|
namespace {
|
|
|
|
inline bool IsLegacyFooterFormat(uint64_t magic_number) {
|
|
|
|
return magic_number == kLegacyBlockBasedTableMagicNumber ||
|
|
|
|
magic_number == kLegacyPlainTableMagicNumber;
|
|
|
|
}
|
|
|
|
inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number) {
|
|
|
|
if (magic_number == kLegacyBlockBasedTableMagicNumber) {
|
|
|
|
return kBlockBasedTableMagicNumber;
|
|
|
|
}
|
|
|
|
if (magic_number == kLegacyPlainTableMagicNumber) {
|
|
|
|
return kPlainTableMagicNumber;
|
|
|
|
}
|
|
|
|
assert(false);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
} // namespace
|
|
|
|
|
2014-05-01 20:09:32 +02:00
|
|
|
// legacy footer format:
|
|
|
|
// metaindex handle (varint64 offset, varint64 size)
|
|
|
|
// index handle (varint64 offset, varint64 size)
|
|
|
|
// <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength
|
|
|
|
// table_magic_number (8 bytes)
|
|
|
|
// new footer format:
|
|
|
|
// checksum (char, 1 byte)
|
|
|
|
// metaindex handle (varint64 offset, varint64 size)
|
|
|
|
// index handle (varint64 offset, varint64 size)
|
|
|
|
// <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength + 1
|
|
|
|
// footer version (4 bytes)
|
|
|
|
// table_magic_number (8 bytes)
|
2011-03-18 23:37:00 +01:00
|
|
|
void Footer::EncodeTo(std::string* dst) const {
|
2015-01-13 23:33:04 +01:00
|
|
|
assert(HasInitializedTableMagicNumber());
|
|
|
|
if (IsLegacyFooterFormat(table_magic_number())) {
|
2014-05-01 20:09:32 +02:00
|
|
|
// has to be default checksum with legacy footer
|
|
|
|
assert(checksum_ == kCRC32c);
|
|
|
|
const size_t original_size = dst->size();
|
|
|
|
metaindex_handle_.EncodeTo(dst);
|
|
|
|
index_handle_.EncodeTo(dst);
|
|
|
|
dst->resize(original_size + 2 * BlockHandle::kMaxEncodedLength); // Padding
|
|
|
|
PutFixed32(dst, static_cast<uint32_t>(table_magic_number() & 0xffffffffu));
|
|
|
|
PutFixed32(dst, static_cast<uint32_t>(table_magic_number() >> 32));
|
|
|
|
assert(dst->size() == original_size + kVersion0EncodedLength);
|
|
|
|
} else {
|
|
|
|
const size_t original_size = dst->size();
|
|
|
|
dst->push_back(static_cast<char>(checksum_));
|
|
|
|
metaindex_handle_.EncodeTo(dst);
|
|
|
|
index_handle_.EncodeTo(dst);
|
2015-01-13 23:33:04 +01:00
|
|
|
dst->resize(original_size + kNewVersionsEncodedLength - 12); // Padding
|
|
|
|
PutFixed32(dst, version());
|
2014-05-01 20:09:32 +02:00
|
|
|
PutFixed32(dst, static_cast<uint32_t>(table_magic_number() & 0xffffffffu));
|
|
|
|
PutFixed32(dst, static_cast<uint32_t>(table_magic_number() >> 32));
|
2015-01-13 23:33:04 +01:00
|
|
|
assert(dst->size() == original_size + kNewVersionsEncodedLength);
|
2014-05-01 20:09:32 +02:00
|
|
|
}
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
2014-05-01 20:09:32 +02:00
|
|
|
|
2015-01-13 23:33:04 +01:00
|
|
|
// Creates a footer with the given magic number and version; the checksum type
// is initialized to kCRC32c.
Footer::Footer(uint64_t _table_magic_number, uint32_t _version)
    : version_(_version),
      checksum_(kCRC32c),
      table_magic_number_(_table_magic_number) {
  // Callers are expected to pair a legacy magic number only with version 0.
  assert(version_ == 0 || !IsLegacyFooterFormat(_table_magic_number));
}
|
2011-03-18 23:37:00 +01:00
|
|
|
|
|
|
|
// Decodes a footer from the tail of *input.
//
// The magic number is read from the last kMagicNumberLengthByte bytes; a
// legacy magic number is silently replaced by its current counterpart and
// implies version 0 with kCRC32c. Otherwise the version is read from the four
// bytes preceding the magic number and the checksum type from the start of
// the footer. On success *input is reset to an empty-length slice positioned
// just past the magic number.
//
// Returns Status::Corruption when a non-legacy input is shorter than
// kNewVersionsEncodedLength, or when the checksum type or either block handle
// fails to decode. The caller must supply at least kMinEncodedLength bytes
// (asserted only).
Status Footer::DecodeFrom(Slice* input) {
  assert(!HasInitializedTableMagicNumber());
  assert(input != nullptr);
  assert(input->size() >= kMinEncodedLength);

  const char* const input_end = input->data() + input->size();
  const char* const magic_ptr = input_end - kMagicNumberLengthByte;
  const uint64_t lo = DecodeFixed32(magic_ptr);
  const uint64_t hi = DecodeFixed32(magic_ptr + 4);
  const uint64_t raw_magic = (hi << 32) | lo;

  // We check for legacy formats here and silently upconvert them
  const bool legacy = IsLegacyFooterFormat(raw_magic);
  set_table_magic_number(legacy ? UpconvertLegacyFooterFormat(raw_magic)
                                : raw_magic);

  if (legacy) {
    // The size was already asserted to be at least kMinEncodedLength above.
    input->remove_prefix(input->size() - kVersion0EncodedLength);
    version_ = 0 /* legacy */;
    checksum_ = kCRC32c;
  } else {
    version_ = DecodeFixed32(magic_ptr - 4);
    // Footer version 1 and higher always occupies exactly
    // kNewVersionsEncodedLength bytes: checksum type, two block handles,
    // padding, a version number, and a magic number.
    if (input->size() < kNewVersionsEncodedLength) {
      return Status::Corruption("input is too short to be an sstable");
    }
    input->remove_prefix(input->size() - kNewVersionsEncodedLength);

    uint32_t chksum;
    if (!GetVarint32(input, &chksum)) {
      return Status::Corruption("bad checksum type");
    }
    checksum_ = static_cast<ChecksumType>(chksum);
  }

  Status result = metaindex_handle_.DecodeFrom(input);
  if (!result.ok()) {
    return result;
  }
  result = index_handle_.DecodeFrom(input);
  if (!result.ok()) {
    return result;
  }

  // We skip over any leftover data (just padding for now) in "input"
  *input = Slice(input_end, input->data() + input->size() - input_end);
  return result;
}
|
|
|
|
|
2014-12-23 22:24:07 +01:00
|
|
|
std::string Footer::ToString() const {
|
|
|
|
std::string result, handle_;
|
|
|
|
result.reserve(1024);
|
|
|
|
|
|
|
|
bool legacy = IsLegacyFooterFormat(table_magic_number_);
|
|
|
|
if (legacy) {
|
|
|
|
result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n ");
|
|
|
|
result.append("index handle: " + index_handle_.ToString() + "\n ");
|
2015-04-24 04:17:57 +02:00
|
|
|
result.append("table_magic_number: " +
|
|
|
|
rocksdb::ToString(table_magic_number_) + "\n ");
|
2014-12-23 22:24:07 +01:00
|
|
|
} else {
|
2015-04-24 04:17:57 +02:00
|
|
|
result.append("checksum: " + rocksdb::ToString(checksum_) + "\n ");
|
2014-12-23 22:24:07 +01:00
|
|
|
result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n ");
|
|
|
|
result.append("index handle: " + index_handle_.ToString() + "\n ");
|
2015-04-24 04:17:57 +02:00
|
|
|
result.append("footer version: " + rocksdb::ToString(version_) + "\n ");
|
|
|
|
result.append("table_magic_number: " +
|
|
|
|
rocksdb::ToString(table_magic_number_) + "\n ");
|
2014-12-23 22:24:07 +01:00
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
// Reads the footer from the tail of `file` and decodes it into *footer.
//
// file_size: size of the file in bytes, as known to the caller.
// enforce_table_magic_number: when non-zero, the decoded table magic number
//     must equal this value.
//
// Returns Status::Corruption if the file (or the number of bytes actually
// read) is shorter than Footer::kMinEncodedLength, or if the magic number
// check fails; otherwise propagates any error from the read or from
// Footer::DecodeFrom.
Status ReadFooterFromFile(RandomAccessFileReader* file, uint64_t file_size,
                          Footer* footer, uint64_t enforce_table_magic_number) {
  if (file_size < Footer::kMinEncodedLength) {
    return Status::Corruption("file is too short to be an sstable");
  }

  char footer_space[Footer::kMaxEncodedLength];
  Slice footer_input;
  // Keep the offset 64-bit: narrowing it to size_t would truncate the footer
  // position on 32-bit builds for files larger than 4 GiB.
  const uint64_t read_offset =
      (file_size > Footer::kMaxEncodedLength)
          ? file_size - static_cast<uint64_t>(Footer::kMaxEncodedLength)
          : static_cast<uint64_t>(0);
  Status s = file->Read(read_offset, Footer::kMaxEncodedLength, &footer_input,
                        footer_space);
  if (!s.ok()) {
    return s;
  }

  // Check that we actually read the whole footer from the file. It may be
  // that file_size isn't correct.
  if (footer_input.size() < Footer::kMinEncodedLength) {
    return Status::Corruption("file is too short to be an sstable");
  }

  s = footer->DecodeFrom(&footer_input);
  if (!s.ok()) {
    return s;
  }
  if (enforce_table_magic_number != 0 &&
      enforce_table_magic_number != footer->table_magic_number()) {
    return Status::Corruption("Bad table magic number");
  }
  return Status::OK();
}
|
|
|
|
|
2014-11-13 20:39:30 +01:00
|
|
|
// Without anonymous namespace here, we fail the warning -Wmissing-prototypes
|
|
|
|
namespace {
|
|
|
|
|
2014-07-31 08:11:59 +02:00
|
|
|
// Read a block and check its CRC
|
|
|
|
// contents is the result of reading.
|
|
|
|
// According to the implementation of file->Read, contents may not point to buf
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
Status ReadBlock(RandomAccessFileReader* file, const Footer& footer,
|
|
|
|
const ReadOptions& options, const BlockHandle& handle,
|
|
|
|
Slice* contents, /* result of reading */ char* buf) {
|
2011-04-21 00:48:11 +02:00
|
|
|
size_t n = static_cast<size_t>(handle.size());
|
2014-08-23 00:28:58 +02:00
|
|
|
Status s;
|
|
|
|
|
|
|
|
{
|
|
|
|
PERF_TIMER_GUARD(block_read_time);
|
|
|
|
s = file->Read(handle.offset(), n + kBlockTrailerSize, contents, buf);
|
|
|
|
}
|
[RocksDB] Added nano second stopwatch and new perf counters to track block read cost
Summary: The pupose of this diff is to expose per user-call level precise timing of block read, so that we can answer questions like: a Get() costs me 100ms, is that somehow related to loading blocks from file system, or sth else? We will answer that with EXACTLY how many blocks have been read, how much time was spent on transfering the bytes from os, how much time was spent on checksum verification and how much time was spent on block decompression, just for that one Get. A nano second stopwatch was introduced to track time with higher precision. The cost/precision of the stopwatch is also measured in unit-test. On my dev box, retrieving one time instance costs about 30ns, on average. The deviation of timing results is good enough to track 100ns-1us level events. And the overhead could be safely ignored for 100us level events (10000 instances/s), for example, a viewstate thrift call.
Test Plan: perf_context_test, also testing with viewstate shadow traffic.
Reviewers: dhruba
Reviewed By: dhruba
CC: leveldb, xjin
Differential Revision: https://reviews.facebook.net/D12351
2013-06-04 08:09:15 +02:00
|
|
|
|
2014-04-08 19:58:07 +02:00
|
|
|
PERF_COUNTER_ADD(block_read_count, 1);
|
|
|
|
PERF_COUNTER_ADD(block_read_byte, n + kBlockTrailerSize);
|
[RocksDB] Added nano second stopwatch and new perf counters to track block read cost
Summary: The pupose of this diff is to expose per user-call level precise timing of block read, so that we can answer questions like: a Get() costs me 100ms, is that somehow related to loading blocks from file system, or sth else? We will answer that with EXACTLY how many blocks have been read, how much time was spent on transfering the bytes from os, how much time was spent on checksum verification and how much time was spent on block decompression, just for that one Get. A nano second stopwatch was introduced to track time with higher precision. The cost/precision of the stopwatch is also measured in unit-test. On my dev box, retrieving one time instance costs about 30ns, on average. The deviation of timing results is good enough to track 100ns-1us level events. And the overhead could be safely ignored for 100us level events (10000 instances/s), for example, a viewstate thrift call.
Test Plan: perf_context_test, also testing with viewstate shadow traffic.
Reviewers: dhruba
Reviewed By: dhruba
CC: leveldb, xjin
Differential Revision: https://reviews.facebook.net/D12351
2013-06-04 08:09:15 +02:00
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
2014-07-31 08:11:59 +02:00
|
|
|
if (contents->size() != n + kBlockTrailerSize) {
|
2011-03-18 23:37:00 +01:00
|
|
|
return Status::Corruption("truncated block read");
|
|
|
|
}
|
|
|
|
|
|
|
|
// Check the crc of the type and the block contents
|
2014-07-31 08:11:59 +02:00
|
|
|
const char* data = contents->data(); // Pointer to where Read put the data
|
2011-03-18 23:37:00 +01:00
|
|
|
if (options.verify_checksums) {
|
2014-08-23 00:28:58 +02:00
|
|
|
PERF_TIMER_GUARD(block_checksum_time);
|
2014-05-01 20:09:32 +02:00
|
|
|
uint32_t value = DecodeFixed32(data + n + 1);
|
2014-05-01 20:12:35 +02:00
|
|
|
uint32_t actual = 0;
|
2014-05-01 20:09:32 +02:00
|
|
|
switch (footer.checksum()) {
|
|
|
|
case kCRC32c:
|
|
|
|
value = crc32c::Unmask(value);
|
|
|
|
actual = crc32c::Value(data, n + 1);
|
|
|
|
break;
|
|
|
|
case kxxHash:
|
2014-11-11 22:47:22 +01:00
|
|
|
actual = XXH32(data, static_cast<int>(n) + 1, 0);
|
2014-05-01 20:09:32 +02:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
s = Status::Corruption("unknown checksum type");
|
|
|
|
}
|
|
|
|
if (s.ok() && actual != value) {
|
2011-03-18 23:37:00 +01:00
|
|
|
s = Status::Corruption("block checksum mismatch");
|
2014-05-01 20:09:32 +02:00
|
|
|
}
|
|
|
|
if (!s.ok()) {
|
2011-03-18 23:37:00 +01:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
}
|
2014-07-31 08:11:59 +02:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2014-11-13 20:39:30 +01:00
|
|
|
} // namespace
|
|
|
|
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
// Produces the contents of the block identified by `handle` in *contents.
//
// Lookup order:
//   1. If a persistent cache is configured and is not in compressed mode, try
//      it for an already-uncompressed page and return immediately on a hit.
//   2. If a persistent cache is configured in compressed mode, try it for the
//      raw page (block plus trailer).
//   3. Otherwise, or on a miss, read the block from `file` via ReadBlock.
//
// When decompression_requested is true and the block's type byte is not
// kNoCompression, the block is uncompressed with UncompressBlockContents.
// Otherwise the raw bytes are handed back, either as a non-owning slice (when
// the reader returned memory other than our buffer) or as an owned heap
// buffer.
//
// With read_options.fill_cache set, a successful device read populates the
// compressed-mode cache with the raw page, and a successful result populates
// the uncompressed-mode cache.
//
// Non-NotFound persistent-cache lookup errors are logged to info_log (if any)
// and otherwise ignored. `env` is not used by this function.
Status ReadBlockContents(RandomAccessFileReader* file, const Footer& footer,
                         const ReadOptions& read_options,
                         const BlockHandle& handle, BlockContents* contents,
                         Env* env, bool decompression_requested,
                         const Slice& compression_dict,
                         const PersistentCacheOptions& cache_options,
                         Logger* info_log) {
  Status status;
  Slice slice;
  size_t n = static_cast<size_t>(handle.size());
  std::unique_ptr<char[]> heap_buf;
  char stack_buf[DefaultStackBufferSize];
  char* used_buf = nullptr;

  if (cache_options.persistent_cache &&
      !cache_options.persistent_cache->IsCompressed()) {
    status = PersistentCacheHelper::LookupUncompressedPage(cache_options,
                                                           handle, contents);
    if (status.ok()) {
      // uncompressed page is found for the block handle
      return status;
    } else {
      // uncompressed page is not found
      if (info_log && !status.IsNotFound()) {
        assert(!status.ok());
        Log(InfoLogLevel::INFO_LEVEL, info_log,
            "Error reading from persistent cache. %s",
            status.ToString().c_str());
      }
    }
  }

  if (cache_options.persistent_cache &&
      cache_options.persistent_cache->IsCompressed()) {
    // look up the raw (block + trailer) page in the compressed-mode p-cache
    status = PersistentCacheHelper::LookupRawPage(
        cache_options, handle, &heap_buf, n + kBlockTrailerSize);
  } else {
    status = Status::NotFound();
  }

  if (status.ok()) {
    // cache hit
    used_buf = heap_buf.get();
    slice = Slice(heap_buf.get(), n);
  } else {
    if (info_log && !status.IsNotFound()) {
      assert(!status.ok());
      Log(InfoLogLevel::INFO_LEVEL, info_log,
          "Error reading from persistent cache. %s", status.ToString().c_str());
    }
    // cache miss read from device
    if (decompression_requested &&
        n + kBlockTrailerSize < DefaultStackBufferSize) {
      // If we've got a small enough hunk of data, read it in to the
      // trivially allocated stack buffer instead of needing a full malloc()
      used_buf = &stack_buf[0];
    } else {
      heap_buf = std::unique_ptr<char[]>(new char[n + kBlockTrailerSize]);
      used_buf = heap_buf.get();
    }

    status = ReadBlock(file, footer, read_options, handle, &slice, used_buf);
    if (status.ok() && read_options.fill_cache &&
        cache_options.persistent_cache &&
        cache_options.persistent_cache->IsCompressed()) {
      // insert to raw cache
      //
      // Insert from slice.data(), not used_buf: the reader may return memory
      // other than the buffer we supplied (handled below as
      // slice.data() != used_buf), in which case used_buf was never written.
      // ReadBlock only succeeds when the slice holds exactly
      // n + kBlockTrailerSize bytes.
      PersistentCacheHelper::InsertRawPage(cache_options, handle, slice.data(),
                                           n + kBlockTrailerSize);
    }
  }

  if (!status.ok()) {
    return status;
  }

  PERF_TIMER_GUARD(block_decompress_time);

  // The byte immediately after the block payload holds its compression type.
  const rocksdb::CompressionType compression_type =
      static_cast<rocksdb::CompressionType>(slice.data()[n]);

  if (decompression_requested && compression_type != kNoCompression) {
    // compressed page, uncompress, update cache
    status = UncompressBlockContents(slice.data(), n, contents,
                                     footer.version(), compression_dict);
  } else if (slice.data() != used_buf) {
    // the slice content is not the buffer provided
    *contents = BlockContents(Slice(slice.data(), n), false, compression_type);
  } else {
    // page is uncompressed, the buffer either stack or heap provided
    if (used_buf == &stack_buf[0]) {
      // The stack buffer dies with this frame, so move the payload into a
      // heap buffer that *contents can take over.
      heap_buf = std::unique_ptr<char[]>(new char[n]);
      memcpy(heap_buf.get(), stack_buf, n);
    }
    *contents = BlockContents(std::move(heap_buf), n, true, compression_type);
  }

  if (status.ok() && read_options.fill_cache &&
      cache_options.persistent_cache &&
      !cache_options.persistent_cache->IsCompressed()) {
    // insert to uncompressed cache
    PersistentCacheHelper::InsertUncompressedPage(cache_options, handle,
                                                  *contents);
  }

  return status;
}
|
|
|
|
|
2016-06-11 03:20:54 +02:00
|
|
|
Status UncompressBlockContentsForCompressionType(
|
|
|
|
const char* data, size_t n, BlockContents* contents,
|
|
|
|
uint32_t format_version, const Slice& compression_dict,
|
|
|
|
CompressionType compression_type) {
|
2014-08-16 00:05:09 +02:00
|
|
|
std::unique_ptr<char[]> ubuf;
|
2016-06-11 03:20:54 +02:00
|
|
|
|
|
|
|
assert(compression_type != kNoCompression && "Invalid compression type");
|
|
|
|
|
2012-06-29 04:26:43 +02:00
|
|
|
int decompress_size = 0;
|
2016-06-11 03:20:54 +02:00
|
|
|
switch (compression_type) {
|
2011-03-23 00:24:02 +01:00
|
|
|
case kSnappyCompression: {
|
2011-07-21 04:40:18 +02:00
|
|
|
size_t ulength = 0;
|
2012-12-20 23:25:06 +01:00
|
|
|
static char snappy_corrupt_msg[] =
|
|
|
|
"Snappy not supported or corrupted Snappy compressed block contents";
|
2015-01-09 21:57:11 +01:00
|
|
|
if (!Snappy_GetUncompressedLength(data, n, &ulength)) {
|
2012-12-20 23:25:06 +01:00
|
|
|
return Status::Corruption(snappy_corrupt_msg);
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
2016-04-20 07:54:24 +02:00
|
|
|
ubuf.reset(new char[ulength]);
|
2015-01-09 21:57:11 +01:00
|
|
|
if (!Snappy_Uncompress(data, n, ubuf.get())) {
|
2012-12-20 23:25:06 +01:00
|
|
|
return Status::Corruption(snappy_corrupt_msg);
|
2011-07-21 04:40:18 +02:00
|
|
|
}
|
2014-08-16 00:05:09 +02:00
|
|
|
*contents = BlockContents(std::move(ubuf), ulength, true, kNoCompression);
|
2011-03-18 23:37:00 +01:00
|
|
|
break;
|
|
|
|
}
|
2012-06-28 08:41:33 +02:00
|
|
|
case kZlibCompression:
|
2016-04-20 07:54:24 +02:00
|
|
|
ubuf.reset(Zlib_Uncompress(
|
2015-01-15 01:24:24 +01:00
|
|
|
data, n, &decompress_size,
|
Shared dictionary compression using reference block
Summary:
This adds a new metablock containing a shared dictionary that is used
to compress all data blocks in the SST file. The size of the shared dictionary
is configurable in CompressionOptions and defaults to 0. It's currently only
used for zlib/lz4/lz4hc, but the block will be stored in the SST regardless of
the compression type if the user chooses a nonzero dictionary size.
During compaction, computes the dictionary by randomly sampling the first
output file in each subcompaction. It pre-computes the intervals to sample
by assuming the output file will have the maximum allowable length. In case
the file is smaller, some of the pre-computed sampling intervals can be beyond
end-of-file, in which case we skip over those samples and the dictionary will
be a bit smaller. After the dictionary is generated using the first file in a
subcompaction, it is loaded into the compression library before writing each
block in each subsequent file of that subcompaction.
On the read path, gets the dictionary from the metablock, if it exists. Then,
loads that dictionary into the compression library before reading each block.
Test Plan: new unit test
Reviewers: yhchiang, IslamAbdelRahman, cyan, sdong
Reviewed By: sdong
Subscribers: andrewkr, yoshinorim, kradhakrishnan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D52287
2016-04-28 02:36:03 +02:00
|
|
|
GetCompressFormatForVersion(kZlibCompression, format_version),
|
|
|
|
compression_dict));
|
2012-06-28 08:41:33 +02:00
|
|
|
if (!ubuf) {
|
2014-10-01 07:27:39 +02:00
|
|
|
static char zlib_corrupt_msg[] =
|
|
|
|
"Zlib not supported or corrupted Zlib compressed block contents";
|
2012-12-20 23:25:06 +01:00
|
|
|
return Status::Corruption(zlib_corrupt_msg);
|
2012-06-28 08:41:33 +02:00
|
|
|
}
|
2014-09-18 00:08:19 +02:00
|
|
|
*contents =
|
|
|
|
BlockContents(std::move(ubuf), decompress_size, true, kNoCompression);
|
2012-06-28 08:41:33 +02:00
|
|
|
break;
|
2012-06-29 04:26:43 +02:00
|
|
|
case kBZip2Compression:
|
2016-04-20 07:54:24 +02:00
|
|
|
ubuf.reset(BZip2_Uncompress(
|
2015-01-15 01:24:24 +01:00
|
|
|
data, n, &decompress_size,
|
|
|
|
GetCompressFormatForVersion(kBZip2Compression, format_version)));
|
2012-06-29 04:26:43 +02:00
|
|
|
if (!ubuf) {
|
2014-10-01 07:27:39 +02:00
|
|
|
static char bzip2_corrupt_msg[] =
|
|
|
|
"Bzip2 not supported or corrupted Bzip2 compressed block contents";
|
2012-12-20 23:25:06 +01:00
|
|
|
return Status::Corruption(bzip2_corrupt_msg);
|
2012-06-29 04:26:43 +02:00
|
|
|
}
|
2014-09-18 00:08:19 +02:00
|
|
|
*contents =
|
|
|
|
BlockContents(std::move(ubuf), decompress_size, true, kNoCompression);
|
2012-06-29 04:26:43 +02:00
|
|
|
break;
|
2014-02-08 03:12:30 +01:00
|
|
|
case kLZ4Compression:
|
2016-04-20 07:54:24 +02:00
|
|
|
ubuf.reset(LZ4_Uncompress(
|
2015-01-15 01:24:24 +01:00
|
|
|
data, n, &decompress_size,
|
Shared dictionary compression using reference block
Summary:
This adds a new metablock containing a shared dictionary that is used
to compress all data blocks in the SST file. The size of the shared dictionary
is configurable in CompressionOptions and defaults to 0. It's currently only
used for zlib/lz4/lz4hc, but the block will be stored in the SST regardless of
the compression type if the user chooses a nonzero dictionary size.
During compaction, computes the dictionary by randomly sampling the first
output file in each subcompaction. It pre-computes the intervals to sample
by assuming the output file will have the maximum allowable length. In case
the file is smaller, some of the pre-computed sampling intervals can be beyond
end-of-file, in which case we skip over those samples and the dictionary will
be a bit smaller. After the dictionary is generated using the first file in a
subcompaction, it is loaded into the compression library before writing each
block in each subsequent file of that subcompaction.
On the read path, gets the dictionary from the metablock, if it exists. Then,
loads that dictionary into the compression library before reading each block.
Test Plan: new unit test
Reviewers: yhchiang, IslamAbdelRahman, cyan, sdong
Reviewed By: sdong
Subscribers: andrewkr, yoshinorim, kradhakrishnan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D52287
2016-04-28 02:36:03 +02:00
|
|
|
GetCompressFormatForVersion(kLZ4Compression, format_version),
|
|
|
|
compression_dict));
|
2014-02-08 03:12:30 +01:00
|
|
|
if (!ubuf) {
|
2014-10-01 07:27:39 +02:00
|
|
|
static char lz4_corrupt_msg[] =
|
|
|
|
"LZ4 not supported or corrupted LZ4 compressed block contents";
|
2014-02-08 03:12:30 +01:00
|
|
|
return Status::Corruption(lz4_corrupt_msg);
|
|
|
|
}
|
2014-09-18 00:08:19 +02:00
|
|
|
*contents =
|
|
|
|
BlockContents(std::move(ubuf), decompress_size, true, kNoCompression);
|
2014-02-08 03:12:30 +01:00
|
|
|
break;
|
|
|
|
case kLZ4HCCompression:
|
2016-04-20 07:54:24 +02:00
|
|
|
ubuf.reset(LZ4_Uncompress(
|
2015-01-15 01:24:24 +01:00
|
|
|
data, n, &decompress_size,
|
Shared dictionary compression using reference block
Summary:
This adds a new metablock containing a shared dictionary that is used
to compress all data blocks in the SST file. The size of the shared dictionary
is configurable in CompressionOptions and defaults to 0. It's currently only
used for zlib/lz4/lz4hc, but the block will be stored in the SST regardless of
the compression type if the user chooses a nonzero dictionary size.
During compaction, computes the dictionary by randomly sampling the first
output file in each subcompaction. It pre-computes the intervals to sample
by assuming the output file will have the maximum allowable length. In case
the file is smaller, some of the pre-computed sampling intervals can be beyond
end-of-file, in which case we skip over those samples and the dictionary will
be a bit smaller. After the dictionary is generated using the first file in a
subcompaction, it is loaded into the compression library before writing each
block in each subsequent file of that subcompaction.
On the read path, gets the dictionary from the metablock, if it exists. Then,
loads that dictionary into the compression library before reading each block.
Test Plan: new unit test
Reviewers: yhchiang, IslamAbdelRahman, cyan, sdong
Reviewed By: sdong
Subscribers: andrewkr, yoshinorim, kradhakrishnan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D52287
2016-04-28 02:36:03 +02:00
|
|
|
GetCompressFormatForVersion(kLZ4HCCompression, format_version),
|
|
|
|
compression_dict));
|
2014-02-08 03:12:30 +01:00
|
|
|
if (!ubuf) {
|
2014-10-01 07:27:39 +02:00
|
|
|
static char lz4hc_corrupt_msg[] =
|
|
|
|
"LZ4HC not supported or corrupted LZ4HC compressed block contents";
|
2014-02-08 03:12:30 +01:00
|
|
|
return Status::Corruption(lz4hc_corrupt_msg);
|
|
|
|
}
|
2014-09-18 00:08:19 +02:00
|
|
|
*contents =
|
|
|
|
BlockContents(std::move(ubuf), decompress_size, true, kNoCompression);
|
2014-02-08 03:12:30 +01:00
|
|
|
break;
|
2016-04-20 07:54:24 +02:00
|
|
|
case kXpressCompression:
|
|
|
|
ubuf.reset(XPRESS_Uncompress(data, n, &decompress_size));
|
|
|
|
if (!ubuf) {
|
|
|
|
static char xpress_corrupt_msg[] =
|
|
|
|
"XPRESS not supported or corrupted XPRESS compressed block contents";
|
|
|
|
return Status::Corruption(xpress_corrupt_msg);
|
|
|
|
}
|
|
|
|
*contents =
|
|
|
|
BlockContents(std::move(ubuf), decompress_size, true, kNoCompression);
|
|
|
|
break;
|
2015-08-28 00:40:42 +02:00
|
|
|
case kZSTDNotFinalCompression:
|
Shared dictionary compression using reference block
Summary:
This adds a new metablock containing a shared dictionary that is used
to compress all data blocks in the SST file. The size of the shared dictionary
is configurable in CompressionOptions and defaults to 0. It's currently only
used for zlib/lz4/lz4hc, but the block will be stored in the SST regardless of
the compression type if the user chooses a nonzero dictionary size.
During compaction, computes the dictionary by randomly sampling the first
output file in each subcompaction. It pre-computes the intervals to sample
by assuming the output file will have the maximum allowable length. In case
the file is smaller, some of the pre-computed sampling intervals can be beyond
end-of-file, in which case we skip over those samples and the dictionary will
be a bit smaller. After the dictionary is generated using the first file in a
subcompaction, it is loaded into the compression library before writing each
block in each subsequent file of that subcompaction.
On the read path, gets the dictionary from the metablock, if it exists. Then,
loads that dictionary into the compression library before reading each block.
Test Plan: new unit test
Reviewers: yhchiang, IslamAbdelRahman, cyan, sdong
Reviewed By: sdong
Subscribers: andrewkr, yoshinorim, kradhakrishnan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D52287
2016-04-28 02:36:03 +02:00
|
|
|
ubuf.reset(ZSTD_Uncompress(data, n, &decompress_size, compression_dict));
|
2015-08-28 00:40:42 +02:00
|
|
|
if (!ubuf) {
|
|
|
|
static char zstd_corrupt_msg[] =
|
|
|
|
"ZSTD not supported or corrupted ZSTD compressed block contents";
|
|
|
|
return Status::Corruption(zstd_corrupt_msg);
|
|
|
|
}
|
|
|
|
*contents =
|
|
|
|
BlockContents(std::move(ubuf), decompress_size, true, kNoCompression);
|
|
|
|
break;
|
2011-03-18 23:37:00 +01:00
|
|
|
default:
|
|
|
|
return Status::Corruption("bad block type");
|
|
|
|
}
|
2015-12-16 03:20:10 +01:00
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
2016-06-11 03:20:54 +02:00
|
|
|
//
|
|
|
|
// The 'data' points to the raw block contents that was read in from file.
|
|
|
|
// This method allocates a new heap buffer and the raw block
|
|
|
|
// contents are uncompresed into this buffer. This
|
|
|
|
// buffer is returned via 'result' and it is upto the caller to
|
|
|
|
// free this buffer.
|
|
|
|
// format_version is the block format as defined in include/rocksdb/table.h
|
|
|
|
Status UncompressBlockContents(const char* data, size_t n,
|
|
|
|
BlockContents* contents, uint32_t format_version,
|
|
|
|
const Slice& compression_dict) {
|
|
|
|
assert(data[n] != kNoCompression);
|
|
|
|
return UncompressBlockContentsForCompressionType(
|
|
|
|
data, n, contents, format_version, compression_dict,
|
|
|
|
(CompressionType)data[n]);
|
|
|
|
}
|
|
|
|
|
2013-10-04 06:49:15 +02:00
|
|
|
} // namespace rocksdb
|