36a5f8ed7f
- Replace raw slice comparison with a call to user comparator.
  Added test for custom comparators.

- Fix end of namespace comments.

- Fixed bug in picking inputs for a level-0 compaction.

  When finding overlapping files, the covered range may expand as files
  are added to the input set.  We now correctly expand the range when
  this happens instead of continuing to use the old range.

  For example, suppose L0 contains files with the following ranges:

      F1: a .. d
      F2: c .. g
      F3: f .. j

  and the initial compaction target is F3.  We used to search for range
  f..j which yielded {F2,F3}.  However we now expand the range as soon
  as another file is added.  In this case, when F2 is added, we expand
  the range to c..j and restart the search.  That picks up file F1 as
  well.

  This change fixes a bug related to deleted keys showing up incorrectly
  after a compaction as described in Issue 44.

(Sync with upstream @25072954)
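To make the level-0 fix above concrete, here is a minimal, hypothetical sketch of the expand-and-restart search (the FileRange struct and ExpandedOverlaps() helper are invented names, plain byte-wise string comparison stands in for the user comparator, and the actual logic lives in db/version_set.cc):

// Hypothetical sketch of the expand-and-restart search described above.
#include <stddef.h>
#include <string>
#include <vector>

struct FileRange {
  std::string smallest;
  std::string largest;
};

// Returns every level-0 file whose range overlaps [*begin, *end], expanding
// [*begin, *end] and restarting the scan whenever an added file widens it.
std::vector<FileRange> ExpandedOverlaps(const std::vector<FileRange>& level0,
                                        std::string* begin, std::string* end) {
  std::vector<FileRange> inputs;
  for (size_t i = 0; i < level0.size(); ) {
    const FileRange& f = level0[i++];
    if (f.largest < *begin || f.smallest > *end) {
      continue;  // no overlap with the current target range
    }
    inputs.push_back(f);
    if (f.smallest < *begin || f.largest > *end) {
      // The covered range grew (e.g. adding F2 widens f..j to c..j); expand
      // it and rescan from the start so earlier files such as F1 are found.
      if (f.smallest < *begin) *begin = f.smallest;
      if (f.largest > *end) *end = f.largest;
      inputs.clear();
      i = 0;
    }
  }
  return inputs;
}

With F1: a..d, F2: c..g, F3: f..j and an initial range of f..j, the scan first adds F2, widens the range to c..j, restarts, and ends up with {F1, F2, F3}.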
260 lines · 7.8 KiB · C++
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "db/log_reader.h"

#include <stdio.h>
#include "leveldb/env.h"
#include "util/coding.h"
#include "util/crc32c.h"

namespace leveldb {
namespace log {

Reader::Reporter::~Reporter() {
}

Reader::Reader(SequentialFile* file, Reporter* reporter, bool checksum,
               uint64_t initial_offset)
    : file_(file),
      reporter_(reporter),
      checksum_(checksum),
      backing_store_(new char[kBlockSize]),
      buffer_(),
      eof_(false),
      last_record_offset_(0),
      end_of_buffer_offset_(0),
      initial_offset_(initial_offset) {
}

Reader::~Reader() {
  delete[] backing_store_;
}

bool Reader::SkipToInitialBlock() {
  size_t offset_in_block = initial_offset_ % kBlockSize;
  uint64_t block_start_location = initial_offset_ - offset_in_block;

  // Don't search a block if we'd be in the trailer
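  // (The writer zero-fills any block tail too small to hold a kHeaderSize
  // header, so an initial offset landing in those trailer bytes cannot be
  // the start of a record; resume at the next block instead.)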
  if (offset_in_block > kBlockSize - 6) {
    offset_in_block = 0;
    block_start_location += kBlockSize;
  }

  end_of_buffer_offset_ = block_start_location;

  // Skip to start of first block that can contain the initial record
  if (block_start_location > 0) {
    Status skip_status = file_->Skip(block_start_location);
    if (!skip_status.ok()) {
      ReportDrop(block_start_location, skip_status);
      return false;
    }
  }

  return true;
}

bool Reader::ReadRecord(Slice* record, std::string* scratch) {
  if (last_record_offset_ < initial_offset_) {
    if (!SkipToInitialBlock()) {
      return false;
    }
  }

  scratch->clear();
  record->clear();
  bool in_fragmented_record = false;
  // Record offset of the logical record that we're reading
  // 0 is a dummy value to make compilers happy
  uint64_t prospective_record_offset = 0;

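  // Each loop iteration below consumes one physical record.  A logical
  // record is either a single kFullType record, or a kFirstType record
  // followed by zero or more kMiddleType fragments and a kLastType
  // fragment that are reassembled into *scratch.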
  Slice fragment;
  while (true) {
    uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size();
    const unsigned int record_type = ReadPhysicalRecord(&fragment);
    switch (record_type) {
      case kFullType:
        if (in_fragmented_record) {
          // Handle bug in earlier versions of log::Writer where
          // it could emit an empty kFirstType record at the tail end
          // of a block followed by a kFullType or kFirstType record
          // at the beginning of the next block.
          if (scratch->empty()) {
            in_fragmented_record = false;
          } else {
            ReportCorruption(scratch->size(), "partial record without end(1)");
          }
        }
        prospective_record_offset = physical_record_offset;
        scratch->clear();
        *record = fragment;
        last_record_offset_ = prospective_record_offset;
        return true;

      case kFirstType:
        if (in_fragmented_record) {
          // Handle bug in earlier versions of log::Writer where
          // it could emit an empty kFirstType record at the tail end
          // of a block followed by a kFullType or kFirstType record
          // at the beginning of the next block.
          if (scratch->empty()) {
            in_fragmented_record = false;
          } else {
            ReportCorruption(scratch->size(), "partial record without end(2)");
          }
        }
        prospective_record_offset = physical_record_offset;
        scratch->assign(fragment.data(), fragment.size());
        in_fragmented_record = true;
        break;

      case kMiddleType:
        if (!in_fragmented_record) {
          ReportCorruption(fragment.size(),
                           "missing start of fragmented record(1)");
        } else {
          scratch->append(fragment.data(), fragment.size());
        }
        break;

      case kLastType:
        if (!in_fragmented_record) {
          ReportCorruption(fragment.size(),
                           "missing start of fragmented record(2)");
        } else {
          scratch->append(fragment.data(), fragment.size());
          *record = Slice(*scratch);
          last_record_offset_ = prospective_record_offset;
          return true;
        }
        break;

      case kEof:
        if (in_fragmented_record) {
          ReportCorruption(scratch->size(), "partial record without end(3)");
          scratch->clear();
        }
        return false;

      case kBadRecord:
        if (in_fragmented_record) {
          ReportCorruption(scratch->size(), "error in middle of record");
          in_fragmented_record = false;
          scratch->clear();
        }
        break;

      default: {
        char buf[40];
        snprintf(buf, sizeof(buf), "unknown record type %u", record_type);
        ReportCorruption(
            (fragment.size() + (in_fragmented_record ? scratch->size() : 0)),
            buf);
        in_fragmented_record = false;
        scratch->clear();
        break;
      }
    }
  }
  return false;
}

uint64_t Reader::LastRecordOffset() {
  return last_record_offset_;
}

void Reader::ReportCorruption(size_t bytes, const char* reason) {
  ReportDrop(bytes, Status::Corruption(reason));
}

void Reader::ReportDrop(size_t bytes, const Status& reason) {
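  // Only notify the reporter about bytes dropped at or past initial_offset_;
  // anything earlier was skipped deliberately by SkipToInitialBlock().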
  if (reporter_ != NULL &&
      end_of_buffer_offset_ - buffer_.size() - bytes >= initial_offset_) {
    reporter_->Corruption(bytes, reason);
  }
}

unsigned int Reader::ReadPhysicalRecord(Slice* result) {
  while (true) {
    if (buffer_.size() < kHeaderSize) {
      if (!eof_) {
        // Last read was a full read, so this is a trailer to skip
        buffer_.clear();
        Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
        end_of_buffer_offset_ += buffer_.size();
        if (!status.ok()) {
          buffer_.clear();
          ReportDrop(kBlockSize, status);
          eof_ = true;
          return kEof;
        } else if (buffer_.size() < kBlockSize) {
          eof_ = true;
        }
        continue;
      } else if (buffer_.size() == 0) {
        // End of file
        return kEof;
      } else {
        size_t drop_size = buffer_.size();
        buffer_.clear();
        ReportCorruption(drop_size, "truncated record at end of file");
        return kEof;
      }
    }

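    // Header layout (see db/log_format.h): a 4-byte masked crc32c, a 2-byte
    // little-endian payload length, and a 1-byte record type, followed by
    // the payload itself.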
    // Parse the header
    const char* header = buffer_.data();
    const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
    const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
    const unsigned int type = header[6];
    const uint32_t length = a | (b << 8);
    if (kHeaderSize + length > buffer_.size()) {
      size_t drop_size = buffer_.size();
      buffer_.clear();
      ReportCorruption(drop_size, "bad record length");
      return kBadRecord;
    }

    if (type == kZeroType && length == 0) {
      // Skip zero length record without reporting any drops since
      // such records are produced by the mmap based writing code in
      // env_posix.cc that preallocates file regions.
      buffer_.clear();
      return kBadRecord;
    }

    // Check crc
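    // (The stored crc is computed over the type byte plus the payload and
    // then masked; see the masking helpers in util/crc32c.h.)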
    if (checksum_) {
      uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
      uint32_t actual_crc = crc32c::Value(header + 6, 1 + length);
      if (actual_crc != expected_crc) {
        // Drop the rest of the buffer since "length" itself may have
        // been corrupted and if we trust it, we could find some
        // fragment of a real log record that just happens to look
        // like a valid log record.
        size_t drop_size = buffer_.size();
        buffer_.clear();
        ReportCorruption(drop_size, "checksum mismatch");
        return kBadRecord;
      }
    }

    buffer_.remove_prefix(kHeaderSize + length);

    // Skip physical record that started before initial_offset_
    if (end_of_buffer_offset_ - buffer_.size() - kHeaderSize - length <
        initial_offset_) {
      result->clear();
      return kBadRecord;
    }

    *result = Slice(header + kHeaderSize, length);
    return type;
  }
}

}  // namespace log
}  // namespace leveldb
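For reference, a minimal, hypothetical usage sketch (DumpLog and StderrReporter are invented names, not part of LevelDB): open the file through Env::NewSequentialFile, construct a Reader, and call ReadRecord() in a loop until it returns false.

// Hypothetical usage sketch, not part of log_reader.cc.
#include <stddef.h>
#include <stdio.h>
#include <string>

#include "db/log_reader.h"
#include "leveldb/env.h"
#include "leveldb/status.h"

namespace {

// Reporter that just logs how many bytes were dropped and why.
class StderrReporter : public leveldb::log::Reader::Reporter {
 public:
  virtual void Corruption(size_t bytes, const leveldb::Status& status) {
    fprintf(stderr, "dropped %lu bytes: %s\n",
            static_cast<unsigned long>(bytes), status.ToString().c_str());
  }
};

}  // namespace

// Reads every record in "fname" and prints its size and offset.
bool DumpLog(leveldb::Env* env, const std::string& fname) {
  leveldb::SequentialFile* file;
  if (!env->NewSequentialFile(fname, &file).ok()) {
    return false;
  }
  StderrReporter reporter;
  leveldb::log::Reader reader(file, &reporter, true /*checksum*/,
                              0 /*initial_offset*/);
  leveldb::Slice record;
  std::string scratch;  // backing store for reassembled fragmented records
  while (reader.ReadRecord(&record, &scratch)) {
    printf("record: %lu bytes at offset %lu\n",
           static_cast<unsigned long>(record.size()),
           static_cast<unsigned long>(reader.LastRecordOffset()));
  }
  delete file;
  return true;
}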