Expose the table properties to application

Summary: Provide a public API for users to access the table properties for each SSTable.

Test Plan: Added unit tests to check the function's correctness under different conditions.

Reviewers: haobo, dhruba, sdong

Reviewed By: haobo

CC: leveldb

Differential Revision: https://reviews.facebook.net/D16083
This commit is contained in:
kailiu 2014-02-13 16:28:21 -08:00
parent b2e7ee8b41
commit 63690625cd
12 changed files with 272 additions and 65 deletions

View File

@ -3488,6 +3488,23 @@ Status DBImpl::MakeRoomForWrite(bool force,
return s;
}
// Fills `props` with the table properties of every table in the current
// version. The version is pinned with a reference so that the mutex does not
// have to be held while the properties are being gathered.
Status DBImpl::GetPropertiesOfAllTables(TablePropertiesCollection* props) {
  // Pin the current version under the mutex.
  mutex_.Lock();
  auto pinned_version = versions_->current();
  pinned_version->Ref();
  mutex_.Unlock();

  // Gather the properties without holding the mutex.
  Status status = pinned_version->GetPropertiesOfAllTables(props);

  // Drop the pin, again under the mutex.
  mutex_.Lock();
  pinned_version->Unref();
  mutex_.Unlock();

  return status;
}
// Returns a reference to the name stored in `dbname_`.
const std::string& DBImpl::GetName() const {
return dbname_;
}

View File

@ -515,6 +515,9 @@ class DBImpl : public DB {
// deletion_state which can have new_superversion already allocated.
void InstallSuperVersion(DeletionState& deletion_state);
// Populates `props` with the table properties of all tables in the current
// version. Overrides DB::GetPropertiesOfAllTables.
virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props)
override;
// Function that Get and KeyMayExist call with no_io true or false
// Note: 'value_found' from KeyMayExist propagates here
Status GetImpl(const ReadOptions& options,

View File

@ -10,6 +10,7 @@
#include <algorithm>
#include <set>
#include <unistd.h>
#include <unordered_set>
#include "db/dbformat.h"
#include "db/db_impl.h"
@ -26,6 +27,7 @@
#include "rocksdb/slice.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/table.h"
#include "rocksdb/table_properties.h"
#include "table/block_based_table_factory.h"
#include "util/hash.h"
#include "util/hash_linklist_rep.h"
@ -832,6 +834,28 @@ static long TestGetTickerCount(const Options& options, Tickers ticker_type) {
return options.statistics->getTickerCount(ticker_type);
}
// A helper function that ensures the table properties returned by
// `GetPropertiesOfAllTables` are correct in `GetPropertiesOfAllTablesTest`.
// It expects exactly 4 tables, a different entry count for each of the
// tables, and a total entry count equal to `expected_entries_size`.
void VerifyTableProperties(DB* db, uint64_t expected_entries_size) {
  TablePropertiesCollection props;
  ASSERT_OK(db->GetPropertiesOfAllTables(&props));

  // Use the test harness for this check: a plain assert() is compiled out
  // under NDEBUG and otherwise aborts before the harness can report.
  ASSERT_EQ(4U, props.size());

  // Indirect test: instead of comparing each table's properties one by one,
  // check that all entry counts are distinct and that they add up to the
  // expected total.
  std::unordered_set<uint64_t> unique_entries;
  uint64_t sum = 0;
  for (const auto& item : props) {
    unique_entries.insert(item.second->num_entries);
    sum += item.second->num_entries;
  }

  ASSERT_EQ(props.size(), unique_entries.size());
  ASSERT_EQ(expected_entries_size, sum);
}
TEST(DBTest, Empty) {
do {
Options options = CurrentOptions();
@ -920,6 +944,41 @@ TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) {
TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
}
// Checks GetPropertiesOfAllTables() after reopening the DB and reading from
// zero, two, or all four of the tables before asking for the properties.
TEST(DBTest, GetPropertiesOfAllTablesTest) {
  Options options = CurrentOptions();
  Reopen(&options);

  // Create 4 tables. Table `t` holds 10 + t entries, so every table has a
  // different entry count.
  for (int table = 0; table < 4; ++table) {
    for (int i = 0; i < 10 + table; ++i) {
      // A failed write would invalidate every expectation below, so do not
      // ignore the returned status.
      ASSERT_OK(
          db_->Put(WriteOptions(), std::to_string(table * 100 + i), "val"));
    }
    ASSERT_OK(db_->Flush(FlushOptions()));
  }

  // 1. Read table properties directly from file
  Reopen(&options);
  VerifyTableProperties(db_, 10 + 11 + 12 + 13);

  // 2. Put two tables to table cache and read the other two from file
  Reopen(&options);
  // fetch key from 1st and 2nd table, which will internally place that table
  // to the table cache.
  for (int i = 0; i < 2; ++i) {
    ASSERT_EQ("val", Get(std::to_string(i * 100 + 0)));
  }
  VerifyTableProperties(db_, 10 + 11 + 12 + 13);

  // 3. Put all tables to table cache
  Reopen(&options);
  // fetch key from all 4 tables, which will internally place each table to
  // the table cache.
  for (int i = 0; i < 4; ++i) {
    ASSERT_EQ("val", Get(std::to_string(i * 100 + 0)));
  }
  VerifyTableProperties(db_, 10 + 11 + 12 + 13);
}
TEST(DBTest, LevelLimitReopen) {
Options options = CurrentOptions();
Reopen(&options);
@ -4823,6 +4882,9 @@ class ModelDB: public DB {
Status::NotSupported("Not implemented."));
return s;
}
// Stub: leaves `props` untouched and reports success.
virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) {
  return Status::OK();
}
virtual bool KeyMayExist(const ReadOptions& options,
const Slice& key,
std::string* value,

View File

@ -163,6 +163,32 @@ Status TableCache::Get(const ReadOptions& options,
}
return s;
}
// Stores the table properties of the file described by `file_meta` into
// `*properties`. The reader handle carried by `file_meta` is used when there
// is one; otherwise the table is looked up through FindTable(), which receives
// `no_io`, and the handle obtained that way is released before returning.
Status TableCache::GetTableProperties(
    const EnvOptions& toptions,
    const InternalKeyComparator& internal_comparator,
    const FileMetaData& file_meta,
    std::shared_ptr<const TableProperties>* properties, bool no_io) {
  Status status;
  auto handle = file_meta.table_reader_handle;

  if (handle != nullptr) {
    // The file metadata already carries a reader handle; it is not ours to
    // release.
    *properties = GetTableReaderFromHandle(handle)->GetTableProperties();
    return status;
  }

  bool table_io;
  status = FindTable(toptions, internal_comparator, file_meta.number,
                     file_meta.file_size, &handle, &table_io, no_io);
  if (status.ok()) {
    assert(handle);
    *properties = GetTableReaderFromHandle(handle)->GetTableProperties();
    // This handle came from FindTable(), so give it back.
    ReleaseHandle(handle);
  }
  return status;
}
bool TableCache::PrefixMayMatch(const ReadOptions& options,
const InternalKeyComparator& icomparator,

View File

@ -73,6 +73,18 @@ class TableCache {
// Get TableReader from a cache handle.
TableReader* GetTableReaderFromHandle(Cache::Handle* handle);
// Get the table properties of a given table.
// @no_io: if true, a table that is not present in the table cache yet is NOT
//         loaded and Status::Incomplete() is returned instead; if false (the
//         default), the table may be loaded to the cache.
// @returns: `properties` will be reset on success. Please note that we will
//           return Status::Incomplete() if table is not present in cache and
//           we set `no_io` to be true.
Status GetTableProperties(const EnvOptions& toptions,
const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta,
std::shared_ptr<const TableProperties>* properties,
bool no_io = false);
// Release the handle from a cache
void ReleaseHandle(Cache::Handle* handle);

View File

@ -25,6 +25,8 @@
#include "table/table_reader.h"
#include "table/merger.h"
#include "table/two_level_iterator.h"
#include "table/format.h"
#include "table/meta_blocks.h"
#include "util/coding.h"
#include "util/logging.h"
#include "util/stop_watch.h"
@ -238,6 +240,59 @@ bool Version::PrefixMayMatch(const ReadOptions& options,
return may_match;
}
// Inserts into `props` one entry per file of every level of this version,
// keyed by the table file name. Properties are taken from the table cache when
// the table is already there; otherwise they are read directly from the
// properties block of the file. Returns the first error encountered; entries
// inserted before that error stay in `props`.
Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) {
  auto table_cache = vset_->table_cache_;
  auto options = vset_->options_;
  for (int level = 0; level < num_levels_; level++) {
    for (const auto& file_meta : files_[level]) {
      auto fname = TableFileName(vset_->dbname_, file_meta->number);
      // 1. If the table is already present in table cache, load table
      // properties from there.
      std::shared_ptr<const TableProperties> table_properties;
      Status s = table_cache->GetTableProperties(
          vset_->storage_options_, vset_->icmp_, *file_meta, &table_properties,
          true /* no io */);
      if (s.ok()) {
        props->insert({fname, table_properties});
        continue;
      }

      // We only ignore error type `Incomplete` since it's by design that we
      // disallow loading the table when it's not in table cache.
      if (!s.IsIncomplete()) {
        return s;
      }

      // 2. Table is not present in table cache, we'll read the table properties
      // directly from the properties block in the file.
      std::unique_ptr<RandomAccessFile> file;
      s = vset_->env_->NewRandomAccessFile(fname, &file,
                                           vset_->storage_options_);
      if (!s.ok()) {
        return s;
      }

      // Start from nullptr so the pointer never holds an indeterminate value.
      TableProperties* raw_table_properties = nullptr;
      // By setting the magic number to kInvalidTableMagicNumber, we can by
      // pass the magic number check in the footer.
      s = ReadTableProperties(
          file.get(), file_meta->file_size,
          Footer::kInvalidTableMagicNumber /* table's magic number */,
          vset_->env_, options->info_log.get(), &raw_table_properties);
      if (!s.ok()) {
        return s;
      }
      // Take ownership before doing anything else with the result.
      std::shared_ptr<const TableProperties> direct_properties(
          raw_table_properties);
      RecordTick(options->statistics.get(),
                 NUMBER_DIRECT_LOAD_TABLE_PROPERTIES);

      props->insert({fname, direct_properties});
    }
  }

  return Status::OK();
}
Iterator* Version::NewConcatenatingIterator(const ReadOptions& options,
const EnvOptions& soptions,
int level) const {

View File

@ -190,6 +190,12 @@ class Version {
// Returns the version number of this version
uint64_t GetVersionNumber() const { return version_number_; }
// REQUIRES: the caller holds a reference on this version (Ref()).
// NOTE(review): DBImpl::GetPropertiesOfAllTables calls this after releasing
// its mutex, so the lock is apparently not required to be held -- confirm.
// On success, *props will be populated with all SSTables' table properties.
// The keys of `props` are the sst file names, the values of `props` are the
// tables' properties, represented as shared_ptr.
Status GetPropertiesOfAllTables(TablePropertiesCollection* props);
// used to sort files by size
struct Fsize {
int index;

View File

@ -13,6 +13,7 @@
#include <stdio.h>
#include <memory>
#include <vector>
#include <unordered_map>
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "rocksdb/types.h"
@ -30,6 +31,7 @@ struct Options;
struct ReadOptions;
struct WriteOptions;
struct FlushOptions;
struct TableProperties;
class WriteBatch;
class Env;
@ -61,6 +63,12 @@ struct Range {
Range(const Slice& s, const Slice& l) : start(s), limit(l) { }
};
// A collection of table properties objects, where
// key: the table's file name.
// value: the table properties object of the given table.
typedef std::unordered_map<std::string, std::shared_ptr<const TableProperties>>
TablePropertiesCollection;
// A DB is a persistent ordered map from keys to values.
// A DB is safe for concurrent access from multiple threads without
// any external synchronization.
@ -309,6 +317,8 @@ class DB {
// be set properly
virtual Status GetDbIdentity(std::string& identity) = 0;
// Populates *props with the table properties of the DB's tables, keyed by
// table file name (see TablePropertiesCollection).
virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) = 0;
private:
// No copying allowed
DB(const DB&);

View File

@ -57,9 +57,9 @@ enum Tickers {
* COMPACTION_KEY_DROP_* count the reasons for key drop during compaction
* There are 3 reasons currently.
*/
COMPACTION_KEY_DROP_NEWER_ENTRY, // key was written with a newer value.
COMPACTION_KEY_DROP_OBSOLETE, // The key is obsolete.
COMPACTION_KEY_DROP_USER, // user compaction function has dropped the key.
COMPACTION_KEY_DROP_NEWER_ENTRY, // key was written with a newer value.
COMPACTION_KEY_DROP_OBSOLETE, // The key is obsolete.
COMPACTION_KEY_DROP_USER, // user compaction function has dropped the key.
// Number of keys written to the database via the Put and Write call's
NUMBER_KEYS_WRITTEN,
@ -80,8 +80,7 @@ enum Tickers {
// write throttle because of too many files in L0
STALL_L0_NUM_FILES_MICROS,
RATE_LIMIT_DELAY_MILLIS,
NO_ITERATORS, // number of iterators currently open
NO_ITERATORS, // number of iterators currently open
// Number of MultiGet calls, keys read, and bytes read
NUMBER_MULTIGET_CALLS,
@ -107,77 +106,77 @@ enum Tickers {
// Record the number of calls to GetUpdatesSince. Useful to keep track of
// transaction log iterator refreshes
GET_UPDATES_SINCE_CALLS,
BLOCK_CACHE_COMPRESSED_MISS, // miss in the compressed block cache
BLOCK_CACHE_COMPRESSED_HIT, // hit in the compressed block cache
WAL_FILE_SYNCED, // Number of times WAL sync is done
WAL_FILE_BYTES, // Number of bytes written to WAL
BLOCK_CACHE_COMPRESSED_MISS, // miss in the compressed block cache
BLOCK_CACHE_COMPRESSED_HIT, // hit in the compressed block cache
WAL_FILE_SYNCED, // Number of times WAL sync is done
WAL_FILE_BYTES, // Number of bytes written to WAL
// Writes can be processed by requesting thread or by the thread at the
// head of the writers queue.
WRITE_DONE_BY_SELF,
WRITE_DONE_BY_OTHER,
WRITE_WITH_WAL, // Number of Write calls that request WAL
COMPACT_READ_BYTES, // Bytes read during compaction
COMPACT_WRITE_BYTES, // Bytes written during compaction
WRITE_WITH_WAL, // Number of Write calls that request WAL
COMPACT_READ_BYTES, // Bytes read during compaction
COMPACT_WRITE_BYTES, // Bytes written during compaction
// Number of table's properties loaded directly from file, without creating
// table reader object.
NUMBER_DIRECT_LOAD_TABLE_PROPERTIES,
TICKER_ENUM_MAX
};
// The order of items listed in Tickers should be the same as
// the order listed in TickersNameMap
const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
{ BLOCK_CACHE_MISS, "rocksdb.block.cache.miss" },
{ BLOCK_CACHE_HIT, "rocksdb.block.cache.hit" },
{ BLOCK_CACHE_ADD, "rocksdb.block.cache.add" },
{ BLOCK_CACHE_INDEX_MISS, "rocksdb.block.cache.index.miss" },
{ BLOCK_CACHE_INDEX_HIT, "rocksdb.block.cache.index.hit" },
{ BLOCK_CACHE_FILTER_MISS, "rocksdb.block.cache.filter.miss" },
{ BLOCK_CACHE_FILTER_HIT, "rocksdb.block.cache.filter.hit" },
{ BLOCK_CACHE_DATA_MISS, "rocksdb.block.cache.data.miss" },
{ BLOCK_CACHE_DATA_HIT, "rocksdb.block.cache.data.hit" },
{ BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful" },
{ MEMTABLE_HIT, "rocksdb.memtable.hit" },
{ MEMTABLE_MISS, "rocksdb.memtable.miss" },
{ COMPACTION_KEY_DROP_NEWER_ENTRY, "rocksdb.compaction.key.drop.new" },
{ COMPACTION_KEY_DROP_OBSOLETE, "rocksdb.compaction.key.drop.obsolete" },
{ COMPACTION_KEY_DROP_USER, "rocksdb.compaction.key.drop.user" },
{ NUMBER_KEYS_WRITTEN, "rocksdb.number.keys.written" },
{ NUMBER_KEYS_READ, "rocksdb.number.keys.read" },
{ NUMBER_KEYS_UPDATED, "rocksdb.number.keys.updated" },
{ BYTES_WRITTEN, "rocksdb.bytes.written" },
{ BYTES_READ, "rocksdb.bytes.read" },
{ NO_FILE_CLOSES, "rocksdb.no.file.closes" },
{ NO_FILE_OPENS, "rocksdb.no.file.opens" },
{ NO_FILE_ERRORS, "rocksdb.no.file.errors" },
{ STALL_L0_SLOWDOWN_MICROS, "rocksdb.l0.slowdown.micros" },
{ STALL_MEMTABLE_COMPACTION_MICROS, "rocksdb.memtable.compaction.micros" },
{ STALL_L0_NUM_FILES_MICROS, "rocksdb.l0.num.files.stall.micros" },
{ RATE_LIMIT_DELAY_MILLIS, "rocksdb.rate.limit.delay.millis" },
{ NO_ITERATORS, "rocksdb.num.iterators" },
{ NUMBER_MULTIGET_CALLS, "rocksdb.number.multiget.get" },
{ NUMBER_MULTIGET_KEYS_READ, "rocksdb.number.multiget.keys.read" },
{ NUMBER_MULTIGET_BYTES_READ, "rocksdb.number.multiget.bytes.read" },
{ NUMBER_FILTERED_DELETES, "rocksdb.number.deletes.filtered" },
{ NUMBER_MERGE_FAILURES, "rocksdb.number.merge.failures" },
{ SEQUENCE_NUMBER, "rocksdb.sequence.number" },
{ BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked" },
{ BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful" },
{ NUMBER_OF_RESEEKS_IN_ITERATION, "rocksdb.number.reseeks.iteration" },
{ GET_UPDATES_SINCE_CALLS, "rocksdb.getupdatessince.calls" },
{ BLOCK_CACHE_COMPRESSED_MISS, "rocksdb.block.cachecompressed.miss" },
{ BLOCK_CACHE_COMPRESSED_HIT, "rocksdb.block.cachecompressed.hit" },
{ WAL_FILE_SYNCED, "rocksdb.wal.synced" },
{ WAL_FILE_BYTES, "rocksdb.wal.bytes" },
{ WRITE_DONE_BY_SELF, "rocksdb.write.self" },
{ WRITE_DONE_BY_OTHER, "rocksdb.write.other" },
{ WRITE_WITH_WAL, "rocksdb.write.wal" },
{ COMPACT_READ_BYTES, "rocksdb.compact.read.bytes" },
{ COMPACT_WRITE_BYTES, "rocksdb.compact.write.bytes" },
};
{BLOCK_CACHE_MISS, "rocksdb.block.cache.miss"},
{BLOCK_CACHE_HIT, "rocksdb.block.cache.hit"},
{BLOCK_CACHE_ADD, "rocksdb.block.cache.add"},
{BLOCK_CACHE_INDEX_MISS, "rocksdb.block.cache.index.miss"},
{BLOCK_CACHE_INDEX_HIT, "rocksdb.block.cache.index.hit"},
{BLOCK_CACHE_FILTER_MISS, "rocksdb.block.cache.filter.miss"},
{BLOCK_CACHE_FILTER_HIT, "rocksdb.block.cache.filter.hit"},
{BLOCK_CACHE_DATA_MISS, "rocksdb.block.cache.data.miss"},
{BLOCK_CACHE_DATA_HIT, "rocksdb.block.cache.data.hit"},
{BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful"},
{MEMTABLE_HIT, "rocksdb.memtable.hit"},
{MEMTABLE_MISS, "rocksdb.memtable.miss"},
{COMPACTION_KEY_DROP_NEWER_ENTRY, "rocksdb.compaction.key.drop.new"},
{COMPACTION_KEY_DROP_OBSOLETE, "rocksdb.compaction.key.drop.obsolete"},
{COMPACTION_KEY_DROP_USER, "rocksdb.compaction.key.drop.user"},
{NUMBER_KEYS_WRITTEN, "rocksdb.number.keys.written"},
{NUMBER_KEYS_READ, "rocksdb.number.keys.read"},
{NUMBER_KEYS_UPDATED, "rocksdb.number.keys.updated"},
{BYTES_WRITTEN, "rocksdb.bytes.written"},
{BYTES_READ, "rocksdb.bytes.read"},
{NO_FILE_CLOSES, "rocksdb.no.file.closes"},
{NO_FILE_OPENS, "rocksdb.no.file.opens"},
{NO_FILE_ERRORS, "rocksdb.no.file.errors"},
{STALL_L0_SLOWDOWN_MICROS, "rocksdb.l0.slowdown.micros"},
{STALL_MEMTABLE_COMPACTION_MICROS, "rocksdb.memtable.compaction.micros"},
{STALL_L0_NUM_FILES_MICROS, "rocksdb.l0.num.files.stall.micros"},
{RATE_LIMIT_DELAY_MILLIS, "rocksdb.rate.limit.delay.millis"},
{NO_ITERATORS, "rocksdb.num.iterators"},
{NUMBER_MULTIGET_CALLS, "rocksdb.number.multiget.get"},
{NUMBER_MULTIGET_KEYS_READ, "rocksdb.number.multiget.keys.read"},
{NUMBER_MULTIGET_BYTES_READ, "rocksdb.number.multiget.bytes.read"},
{NUMBER_FILTERED_DELETES, "rocksdb.number.deletes.filtered"},
{NUMBER_MERGE_FAILURES, "rocksdb.number.merge.failures"},
{SEQUENCE_NUMBER, "rocksdb.sequence.number"},
{BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked"},
{BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful"},
{NUMBER_OF_RESEEKS_IN_ITERATION, "rocksdb.number.reseeks.iteration"},
{GET_UPDATES_SINCE_CALLS, "rocksdb.getupdatessince.calls"},
{BLOCK_CACHE_COMPRESSED_MISS, "rocksdb.block.cachecompressed.miss"},
{BLOCK_CACHE_COMPRESSED_HIT, "rocksdb.block.cachecompressed.hit"},
{WAL_FILE_SYNCED, "rocksdb.wal.synced"},
{WAL_FILE_BYTES, "rocksdb.wal.bytes"},
{WRITE_DONE_BY_SELF, "rocksdb.write.self"},
{WRITE_DONE_BY_OTHER, "rocksdb.write.other"},
{WRITE_WITH_WAL, "rocksdb.write.wal"},
{COMPACT_READ_BYTES, "rocksdb.compact.read.bytes"},
{COMPACT_WRITE_BYTES, "rocksdb.compact.write.bytes"},
{NUMBER_DIRECT_LOAD_TABLE_PROPERTIES,
"rocksdb.number.direct.load.table.properties"}, };
/**
* Keep adding histogram's here.

View File

@ -148,6 +148,10 @@ class StackableDB : public DB {
return db_->GetDbIdentity(identity);
}
// Forwards the request to the wrapped DB.
virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props)
    override {
  return db_->GetPropertiesOfAllTables(props);
}
virtual Status GetUpdatesSince(SequenceNumber seq_number,
unique_ptr<TransactionLogIterator>* iter)
override {

View File

@ -270,12 +270,20 @@ Status ReadTableMagicNumber(const std::string& file_path,
uint64_t file_size;
options.env->GetFileSize(file_path, &file_size);
return ReadTableMagicNumber(file.get(), file_size, options, env_options,
table_magic_number);
}
Status ReadTableMagicNumber(RandomAccessFile* file, uint64_t file_size,
const Options& options,
const EnvOptions& env_options,
uint64_t* table_magic_number) {
if (file_size < Footer::kEncodedLength) {
return Status::InvalidArgument("file is too short to be an sstable");
}
Footer footer;
s = ReadFooterFromFile(file.get(), file_size, &footer);
auto s = ReadFooterFromFile(file, file_size, &footer);
if (!s.ok()) {
return s;
}

View File

@ -124,4 +124,9 @@ Status ReadTableMagicNumber(const std::string& file_path,
const Options& options,
const EnvOptions& env_options,
uint64_t* table_magic_number);
// Overload that works on an already opened `file` whose size the caller
// passes in as `file_size`; the path-based overload forwards to this one.
Status ReadTableMagicNumber(RandomAccessFile* file, uint64_t file_size,
const Options& options,
const EnvOptions& env_options,
uint64_t* table_magic_number);
} // namespace rocksdb