rocksdb/db/file_indexer.h

143 lines
6.1 KiB
C
Raw Permalink Normal View History

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <cstdint>
#include <functional>
#include <limits>
#include <vector>
#include "memory/arena.h"
#include "port/port.h"
store file_indexer info in sequential memory Summary: use arena to allocate space for next_level_index_ and level_rb_ Thus increasing data locality and make Version::Get faster. Benchmark detail Base version: commit d2a727c182338514af955bbcb1d92db4af83b41c command used: ./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=2097152 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=fillseq, readrandom,readrandom,readrandom --use_existing_db=0 --num=52428800 --threads=1 Result: cpu running percentage: Version::Get, improved from 7.98% to 7.42% FileIndexer::GetNextLevelIndex, improved from 1.18% to 0.68%. Test Plan: make all check Reviewers: ljin, haobo, yhchiang, sdong Reviewed By: sdong Subscribers: dhruba, igor Differential Revision: https://reviews.facebook.net/D19845
2014-07-16 20:21:30 +02:00
#include "util/autovector.h"
namespace ROCKSDB_NAMESPACE {
class Comparator;
struct FileMetaData;
store file_indexer info in sequential memory Summary: use arena to allocate space for next_level_index_ and level_rb_ Thus increasing data locality and make Version::Get faster. Benchmark detail Base version: commit d2a727c182338514af955bbcb1d92db4af83b41c command used: ./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=2097152 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=fillseq, readrandom,readrandom,readrandom --use_existing_db=0 --num=52428800 --threads=1 Result: cpu running percentage: Version::Get, improved from 7.98% to 7.42% FileIndexer::GetNextLevelIndex, improved from 1.18% to 0.68%. Test Plan: make all check Reviewers: ljin, haobo, yhchiang, sdong Reviewed By: sdong Subscribers: dhruba, igor Differential Revision: https://reviews.facebook.net/D19845
2014-07-16 20:21:30 +02:00
struct FdWithKeyRange;
struct FileLevel;
// The file tree structure in Version is prebuilt and the range of each file
// is known. On Version::Get(), it uses binary search to find a potential file
// and then check if a target key can be found in the file by comparing the key
2015-12-10 17:54:48 +01:00
// to each file's smallest and largest key. The results of these comparisons
// can be reused beyond checking if a key falls into a file's range.
2015-12-10 17:54:48 +01:00
// With some pre-calculated knowledge, each key comparison that has been done
// can serve as a hint to narrow down further searches: if a key compared to
// be smaller than a file's smallest or largest, that comparison can be used
// to find out the right bound of next binary search. Similarly, if a key
// compared to be larger than a file's smallest or largest, it can be utilized
// to find out the left bound of next binary search.
// With these hints: it can greatly reduce the range of binary search,
// especially for bottom levels, given that one file most likely overlaps with
// only N files from level below (where N is max_bytes_for_level_multiplier).
// So on level L, we will only look at ~N files instead of N^L files on the
// naive approach.
class FileIndexer {
public:
store file_indexer info in sequential memory Summary: use arena to allocate space for next_level_index_ and level_rb_ Thus increasing data locality and make Version::Get faster. Benchmark detail Base version: commit d2a727c182338514af955bbcb1d92db4af83b41c command used: ./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=2097152 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=fillseq, readrandom,readrandom,readrandom --use_existing_db=0 --num=52428800 --threads=1 Result: cpu running percentage: Version::Get, improved from 7.98% to 7.42% FileIndexer::GetNextLevelIndex, improved from 1.18% to 0.68%. Test Plan: make all check Reviewers: ljin, haobo, yhchiang, sdong Reviewed By: sdong Subscribers: dhruba, igor Differential Revision: https://reviews.facebook.net/D19845
2014-07-16 20:21:30 +02:00
explicit FileIndexer(const Comparator* ucmp);
size_t NumLevelIndex() const;
size_t LevelIndexSize(size_t level) const;
// Return a file index range in the next level to search for a key based on
2015-12-10 17:54:48 +01:00
// smallest and largest key comparison for the current file specified by
// level and file_index. When *left_index < *right_index, both index should
// be valid and fit in the vector size.
void GetNextLevelIndex(const size_t level, const size_t file_index,
const int cmp_smallest, const int cmp_largest,
int32_t* left_bound, int32_t* right_bound) const;
void UpdateIndex(Arena* arena, const size_t num_levels,
store file_indexer info in sequential memory Summary: use arena to allocate space for next_level_index_ and level_rb_ Thus increasing data locality and make Version::Get faster. Benchmark detail Base version: commit d2a727c182338514af955bbcb1d92db4af83b41c command used: ./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=2097152 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=fillseq, readrandom,readrandom,readrandom --use_existing_db=0 --num=52428800 --threads=1 Result: cpu running percentage: Version::Get, improved from 7.98% to 7.42% FileIndexer::GetNextLevelIndex, improved from 1.18% to 0.68%. Test Plan: make all check Reviewers: ljin, haobo, yhchiang, sdong Reviewed By: sdong Subscribers: dhruba, igor Differential Revision: https://reviews.facebook.net/D19845
2014-07-16 20:21:30 +02:00
std::vector<FileMetaData*>* const files);
enum {
// MSVC version 1800 still does not have constexpr for ::max()
kLevelMaxIndex = ROCKSDB_NAMESPACE::port::kMaxInt32
};
private:
size_t num_levels_;
const Comparator* ucmp_;
struct IndexUnit {
IndexUnit()
: smallest_lb(0), largest_lb(0), smallest_rb(-1), largest_rb(-1) {}
// During file search, a key is compared against smallest and largest
// from a FileMetaData. It can have 3 possible outcomes:
// (1) key is smaller than smallest, implying it is also smaller than
// larger. Precalculated index based on "smallest < smallest" can
// be used to provide right bound.
// (2) key is in between smallest and largest.
// Precalculated index based on "smallest > greatest" can be used to
// provide left bound.
// Precalculated index based on "largest < smallest" can be used to
// provide right bound.
// (3) key is larger than largest, implying it is also larger than smallest.
// Precalculated index based on "largest > largest" can be used to
// provide left bound.
//
// As a result, we will need to do:
// Compare smallest (<=) and largest keys from upper level file with
// smallest key from lower level to get a right bound.
// Compare smallest (>=) and largest keys from upper level file with
// largest key from lower level to get a left bound.
//
// Example:
// level 1: [50 - 60]
// level 2: [1 - 40], [45 - 55], [58 - 80]
// A key 35, compared to be less than 50, 3rd file on level 2 can be
// skipped according to rule (1). LB = 0, RB = 1.
// A key 53, sits in the middle 50 and 60. 1st file on level 2 can be
// skipped according to rule (2)-a, but the 3rd file cannot be skipped
// because 60 is greater than 58. LB = 1, RB = 2.
// A key 70, compared to be larger than 60. 1st and 2nd file can be skipped
// according to rule (3). LB = 2, RB = 2.
//
// Point to a left most file in a lower level that may contain a key,
// which compares greater than smallest of a FileMetaData (upper level)
int32_t smallest_lb;
// Point to a left most file in a lower level that may contain a key,
// which compares greater than largest of a FileMetaData (upper level)
int32_t largest_lb;
// Point to a right most file in a lower level that may contain a key,
// which compares smaller than smallest of a FileMetaData (upper level)
int32_t smallest_rb;
// Point to a right most file in a lower level that may contain a key,
// which compares smaller than largest of a FileMetaData (upper level)
int32_t largest_rb;
};
store file_indexer info in sequential memory Summary: use arena to allocate space for next_level_index_ and level_rb_ Thus increasing data locality and make Version::Get faster. Benchmark detail Base version: commit d2a727c182338514af955bbcb1d92db4af83b41c command used: ./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=2097152 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=fillseq, readrandom,readrandom,readrandom --use_existing_db=0 --num=52428800 --threads=1 Result: cpu running percentage: Version::Get, improved from 7.98% to 7.42% FileIndexer::GetNextLevelIndex, improved from 1.18% to 0.68%. Test Plan: make all check Reviewers: ljin, haobo, yhchiang, sdong Reviewed By: sdong Subscribers: dhruba, igor Differential Revision: https://reviews.facebook.net/D19845
2014-07-16 20:21:30 +02:00
// Data structure to store IndexUnits in a whole level
struct IndexLevel {
size_t num_index;
IndexUnit* index_units;
IndexLevel() : num_index(0), index_units(nullptr) {}
store file_indexer info in sequential memory Summary: use arena to allocate space for next_level_index_ and level_rb_ Thus increasing data locality and make Version::Get faster. Benchmark detail Base version: commit d2a727c182338514af955bbcb1d92db4af83b41c command used: ./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=2097152 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=fillseq, readrandom,readrandom,readrandom --use_existing_db=0 --num=52428800 --threads=1 Result: cpu running percentage: Version::Get, improved from 7.98% to 7.42% FileIndexer::GetNextLevelIndex, improved from 1.18% to 0.68%. Test Plan: make all check Reviewers: ljin, haobo, yhchiang, sdong Reviewed By: sdong Subscribers: dhruba, igor Differential Revision: https://reviews.facebook.net/D19845
2014-07-16 20:21:30 +02:00
};
void CalculateLB(
const std::vector<FileMetaData*>& upper_files,
const std::vector<FileMetaData*>& lower_files, IndexLevel* index_level,
std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
std::function<void(IndexUnit*, int32_t)> set_index);
void CalculateRB(
const std::vector<FileMetaData*>& upper_files,
const std::vector<FileMetaData*>& lower_files, IndexLevel* index_level,
std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
std::function<void(IndexUnit*, int32_t)> set_index);
store file_indexer info in sequential memory Summary: use arena to allocate space for next_level_index_ and level_rb_ Thus increasing data locality and make Version::Get faster. Benchmark detail Base version: commit d2a727c182338514af955bbcb1d92db4af83b41c command used: ./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=2097152 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=fillseq, readrandom,readrandom,readrandom --use_existing_db=0 --num=52428800 --threads=1 Result: cpu running percentage: Version::Get, improved from 7.98% to 7.42% FileIndexer::GetNextLevelIndex, improved from 1.18% to 0.68%. Test Plan: make all check Reviewers: ljin, haobo, yhchiang, sdong Reviewed By: sdong Subscribers: dhruba, igor Differential Revision: https://reviews.facebook.net/D19845
2014-07-16 20:21:30 +02:00
autovector<IndexLevel> next_level_index_;
int32_t* level_rb_;
};
} // namespace ROCKSDB_NAMESPACE