2011-03-18 23:37:00 +01:00
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
2011-03-30 20:35:40 +02:00
|
|
|
#include "leveldb/table.h"
|
2011-03-18 23:37:00 +01:00
|
|
|
|
2011-03-30 20:35:40 +02:00
|
|
|
#include "leveldb/cache.h"
|
2012-04-17 17:36:46 +02:00
|
|
|
#include "leveldb/comparator.h"
|
2011-03-30 20:35:40 +02:00
|
|
|
#include "leveldb/env.h"
|
2012-04-17 17:36:46 +02:00
|
|
|
#include "leveldb/filter_policy.h"
|
|
|
|
#include "leveldb/options.h"
|
2012-11-03 05:02:40 +01:00
|
|
|
#include "leveldb/statistics.h"
|
2011-03-18 23:37:00 +01:00
|
|
|
#include "table/block.h"
|
2012-04-17 17:36:46 +02:00
|
|
|
#include "table/filter_block.h"
|
2011-03-18 23:37:00 +01:00
|
|
|
#include "table/format.h"
|
|
|
|
#include "table/two_level_iterator.h"
|
|
|
|
#include "util/coding.h"
|
|
|
|
|
|
|
|
namespace leveldb {
|
|
|
|
|
|
|
|
// Private state behind a Table. Table::Open fills in options, file,
// metaindex_handle, index_block, cache_id, filter_data and filter before
// handing the Rep to the Table; ~Table deletes the Rep.
struct Table::Rep {
  // Give every raw pointer (and the cache id) a defined starting value so
  // that destroying a Rep that was never fully populated does not hand
  // indeterminate pointers to delete in ~Rep().
  Rep()
      : file(NULL),
        cache_id(0),
        filter(NULL),
        filter_data(NULL),
        index_block(NULL) {
  }

  // Releases the filter reader, the filter bytes it was built over, and the
  // index block. `file` is not deleted here.
  ~Rep() {
    delete filter;
    delete [] filter_data;
    delete index_block;
  }

  Options options;
  Status status;
  RandomAccessFile* file;
  uint64_t cache_id;          // options.block_cache->NewId(), or 0 without a cache
  FilterBlockReader* filter;  // NULL until ReadFilter() installs a reader
  const char* filter_data;    // Set only when the filter block was heap allocated

  BlockHandle metaindex_handle;  // Handle to metaindex_block: saved from footer
  Block* index_block;
};
|
|
|
|
|
|
|
|
// Opens the table stored in the first `size` bytes of `file`.
//
// On success stores a newly allocated Table in *table and returns OK.
// On any failure *table is left NULL and the failing status is returned.
// `file` is stored in the table but is not deleted by it.
Status Table::Open(const Options& options,
                   RandomAccessFile* file,
                   uint64_t size,
                   Table** table) {
  *table = NULL;
  if (size < Footer::kEncodedLength) {
    return Status::InvalidArgument("file is too short to be an sstable");
  }

  // The footer occupies the last Footer::kEncodedLength bytes of the file.
  char footer_buf[Footer::kEncodedLength];
  Slice footer_bytes;
  Status s = file->Read(size - Footer::kEncodedLength, Footer::kEncodedLength,
                        &footer_bytes, footer_buf);
  if (!s.ok()) {
    return s;
  }

  Footer footer;
  s = footer.DecodeFrom(&footer_bytes);
  if (!s.ok()) {
    return s;
  }

  // Read the index block. Nothing has been allocated yet, so a failure here
  // can simply be returned.
  BlockContents index_contents;
  s = ReadBlock(file, ReadOptions(), footer.index_handle(), &index_contents);
  if (!s.ok()) {
    return s;
  }
  Block* index_block = new Block(index_contents);

  // We've successfully read the footer and the index block: we're
  // ready to serve requests.
  Rep* rep = new Table::Rep;
  rep->options = options;
  rep->file = file;
  rep->metaindex_handle = footer.metaindex_handle();
  rep->index_block = index_block;
  if (options.block_cache) {
    rep->cache_id = options.block_cache->NewId();
  } else {
    rep->cache_id = 0;
  }
  rep->filter_data = NULL;
  rep->filter = NULL;

  Table* opened = new Table(rep);
  *table = opened;
  // Filter metadata is optional; ReadMeta() reports no status.
  opened->ReadMeta(footer);
  return s;
}
|
|
|
|
|
2012-04-17 17:36:46 +02:00
|
|
|
void Table::ReadMeta(const Footer& footer) {
|
|
|
|
if (rep_->options.filter_policy == NULL) {
|
|
|
|
return; // Do not need any metadata
|
|
|
|
}
|
|
|
|
|
|
|
|
// TODO(sanjay): Skip this if footer.metaindex_handle() size indicates
|
|
|
|
// it is an empty block.
|
|
|
|
ReadOptions opt;
|
|
|
|
BlockContents contents;
|
|
|
|
if (!ReadBlock(rep_->file, opt, footer.metaindex_handle(), &contents).ok()) {
|
|
|
|
// Do not propagate errors since meta info is not needed for operation
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
Block* meta = new Block(contents);
|
|
|
|
|
|
|
|
Iterator* iter = meta->NewIterator(BytewiseComparator());
|
|
|
|
std::string key = "filter.";
|
|
|
|
key.append(rep_->options.filter_policy->Name());
|
|
|
|
iter->Seek(key);
|
|
|
|
if (iter->Valid() && iter->key() == Slice(key)) {
|
|
|
|
ReadFilter(iter->value());
|
|
|
|
}
|
|
|
|
delete iter;
|
|
|
|
delete meta;
|
|
|
|
}
|
|
|
|
|
|
|
|
void Table::ReadFilter(const Slice& filter_handle_value) {
|
|
|
|
Slice v = filter_handle_value;
|
|
|
|
BlockHandle filter_handle;
|
|
|
|
if (!filter_handle.DecodeFrom(&v).ok()) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// We might want to unify with ReadBlock() if we start
|
|
|
|
// requiring checksum verification in Table::Open.
|
|
|
|
ReadOptions opt;
|
|
|
|
BlockContents block;
|
|
|
|
if (!ReadBlock(rep_->file, opt, filter_handle, &block).ok()) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if (block.heap_allocated) {
|
|
|
|
rep_->filter_data = block.data.data(); // Will need to delete later
|
|
|
|
}
|
|
|
|
rep_->filter = new FilterBlockReader(rep_->options.filter_policy, block.data);
|
|
|
|
}
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
// Destroys the table's private state; ~Rep() releases what the Rep owns.
Table::~Table() { delete rep_; }
|
|
|
|
|
|
|
|
static void DeleteBlock(void* arg, void* ignored) {
|
|
|
|
delete reinterpret_cast<Block*>(arg);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Cache deleter callback: deletes the Block stored as the cache entry's
// value. The key is unused.
static void DeleteCachedBlock(const Slice& key, void* value) {
  delete reinterpret_cast<Block*>(value);
}
|
|
|
|
|
|
|
|
static void ReleaseBlock(void* arg, void* h) {
|
|
|
|
Cache* cache = reinterpret_cast<Cache*>(arg);
|
|
|
|
Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h);
|
|
|
|
cache->Release(handle);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Convert an index iterator value (i.e., an encoded BlockHandle)
|
|
|
|
// into an iterator over the contents of the corresponding block.
|
|
|
|
Iterator* Table::BlockReader(void* arg,
|
|
|
|
const ReadOptions& options,
|
2012-09-27 10:05:38 +02:00
|
|
|
const Slice& index_value,
|
|
|
|
bool* didIO) {
|
2011-03-18 23:37:00 +01:00
|
|
|
Table* table = reinterpret_cast<Table*>(arg);
|
|
|
|
Cache* block_cache = table->rep_->options.block_cache;
|
2012-11-03 05:02:40 +01:00
|
|
|
Statistics* const statistics = table->rep_->options.statistics;
|
2011-03-18 23:37:00 +01:00
|
|
|
Block* block = NULL;
|
|
|
|
Cache::Handle* cache_handle = NULL;
|
|
|
|
|
|
|
|
BlockHandle handle;
|
|
|
|
Slice input = index_value;
|
|
|
|
Status s = handle.DecodeFrom(&input);
|
|
|
|
// We intentionally allow extra stuff in index_value so that we
|
|
|
|
// can add more features in the future.
|
|
|
|
|
|
|
|
if (s.ok()) {
|
2012-04-17 17:36:46 +02:00
|
|
|
BlockContents contents;
|
2011-03-18 23:37:00 +01:00
|
|
|
if (block_cache != NULL) {
|
|
|
|
char cache_key_buffer[16];
|
|
|
|
EncodeFixed64(cache_key_buffer, table->rep_->cache_id);
|
|
|
|
EncodeFixed64(cache_key_buffer+8, handle.offset());
|
|
|
|
Slice key(cache_key_buffer, sizeof(cache_key_buffer));
|
|
|
|
cache_handle = block_cache->Lookup(key);
|
|
|
|
if (cache_handle != NULL) {
|
|
|
|
block = reinterpret_cast<Block*>(block_cache->Value(cache_handle));
|
2012-11-03 05:02:40 +01:00
|
|
|
|
|
|
|
RecordTick(statistics, BLOCK_CACHE_HIT);
|
2011-03-18 23:37:00 +01:00
|
|
|
} else {
|
2012-04-17 17:36:46 +02:00
|
|
|
s = ReadBlock(table->rep_->file, options, handle, &contents);
|
|
|
|
if (s.ok()) {
|
|
|
|
block = new Block(contents);
|
|
|
|
if (contents.cachable && options.fill_cache) {
|
|
|
|
cache_handle = block_cache->Insert(
|
|
|
|
key, block, block->size(), &DeleteCachedBlock);
|
|
|
|
}
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
2012-09-27 10:05:38 +02:00
|
|
|
if (didIO != NULL) {
|
|
|
|
*didIO = true; // we did some io from storage
|
|
|
|
}
|
2012-11-03 05:02:40 +01:00
|
|
|
|
|
|
|
RecordTick(statistics, BLOCK_CACHE_MISS);
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
} else {
|
2012-04-17 17:36:46 +02:00
|
|
|
s = ReadBlock(table->rep_->file, options, handle, &contents);
|
|
|
|
if (s.ok()) {
|
|
|
|
block = new Block(contents);
|
|
|
|
}
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Iterator* iter;
|
|
|
|
if (block != NULL) {
|
|
|
|
iter = block->NewIterator(table->rep_->options.comparator);
|
|
|
|
if (cache_handle == NULL) {
|
|
|
|
iter->RegisterCleanup(&DeleteBlock, block, NULL);
|
|
|
|
} else {
|
|
|
|
iter->RegisterCleanup(&ReleaseBlock, block_cache, cache_handle);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
iter = NewErrorIterator(s);
|
|
|
|
}
|
|
|
|
return iter;
|
|
|
|
}
|
|
|
|
|
2012-09-27 10:05:38 +02:00
|
|
|
// Three-argument overload: same as the four-argument BlockReader() with no
// didIO out-parameter.
Iterator* Table::BlockReader(void* arg,
                             const ReadOptions& options,
                             const Slice& index_value) {
  bool* const no_io_flag = NULL;  // Caller does not want I/O reporting
  return BlockReader(arg, options, index_value, no_io_flag);
}
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
// Returns a two-level iterator over the table: the first level walks the
// index block, and BlockReader() turns each index entry into an iterator
// over the corresponding block.
Iterator* Table::NewIterator(const ReadOptions& options) const {
  Iterator* index_iter =
      rep_->index_block->NewIterator(rep_->options.comparator);
  // BlockReader() takes its Table as a non-const void* argument.
  Table* self = const_cast<Table*>(this);
  return NewTwoLevelIterator(index_iter, &Table::BlockReader, self, options);
}
|
|
|
|
|
2012-04-17 17:36:46 +02:00
|
|
|
// Seeks to `k` in the table and, if an entry is found at or after it in the
// candidate block, calls (*saver)(arg, found_key, found_value, didIO).
// If a filter is loaded and says the block cannot contain `k`, the block is
// not read at all. Returns the first non-OK status from the block iterator
// or the index iterator, otherwise OK.
Status Table::InternalGet(const ReadOptions& options, const Slice& k,
                          void* arg,
                          void (*saver)(void*, const Slice&, const Slice&, bool)) {
  Status s;
  Iterator* index_iter =
      rep_->index_block->NewIterator(rep_->options.comparator);
  index_iter->Seek(k);

  if (index_iter->Valid()) {
    Slice handle_value = index_iter->value();
    FilterBlockReader* const filter = rep_->filter;
    BlockHandle handle;

    // Without a filter, or if the handle does not decode, assume the key may
    // be present and fall through to reading the block.
    bool may_match = true;
    if (filter != NULL && handle.DecodeFrom(&handle_value).ok()) {
      may_match = filter->KeyMayMatch(handle.offset(), k);
    }

    if (!may_match) {
      // Not found
      RecordTick(rep_->options.statistics, BLOOM_FILTER_USEFUL);
    } else {
      bool didIO = false;
      Iterator* block_iter =
          BlockReader(this, options, index_iter->value(), &didIO);
      block_iter->Seek(k);
      if (block_iter->Valid()) {
        (*saver)(arg, block_iter->key(), block_iter->value(), didIO);
      }
      s = block_iter->status();
      delete block_iter;
    }
  }

  // A block-level error takes precedence over the index iterator's status.
  if (s.ok()) {
    s = index_iter->status();
  }
  delete index_iter;
  return s;
}
|
|
|
|
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
uint64_t Table::ApproximateOffsetOf(const Slice& key) const {
|
|
|
|
Iterator* index_iter =
|
|
|
|
rep_->index_block->NewIterator(rep_->options.comparator);
|
|
|
|
index_iter->Seek(key);
|
|
|
|
uint64_t result;
|
|
|
|
if (index_iter->Valid()) {
|
|
|
|
BlockHandle handle;
|
|
|
|
Slice input = index_iter->value();
|
|
|
|
Status s = handle.DecodeFrom(&input);
|
|
|
|
if (s.ok()) {
|
|
|
|
result = handle.offset();
|
|
|
|
} else {
|
|
|
|
// Strange: we can't decode the block handle in the index block.
|
|
|
|
// We'll just return the offset of the metaindex block, which is
|
|
|
|
// close to the whole file size for this case.
|
|
|
|
result = rep_->metaindex_handle.offset();
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// key is past the last key in the file. Approximate the offset
|
|
|
|
// by returning the offset of the metaindex block (which is
|
|
|
|
// right near the end of the file).
|
|
|
|
result = rep_->metaindex_handle.offset();
|
|
|
|
}
|
|
|
|
delete index_iter;
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2011-10-31 18:22:06 +01:00
|
|
|
} // namespace leveldb
|