diff --git a/HISTORY.md b/HISTORY.md
index f64d53275..e3095bc5e 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -5,6 +5,7 @@
 ### Public API changes
 * Added _LEVEL to all InfoLogLevel enums
 * Deprecated ReadOptions.prefix and ReadOptions.prefix_seek. Seek() defaults to prefix-based seek when Options.prefix_extractor is supplied. More detail is documented in https://github.com/facebook/rocksdb/wiki/Prefix-Seek-API-Changes
+* MemTableRepFactory::CreateMemTableRep() takes an info logger as an extra parameter.
 
 ### New Features
 * Column family support
diff --git a/db/memtable.cc b/db/memtable.cc
index f95ad3c98..45f58b979 100644
--- a/db/memtable.cc
+++ b/db/memtable.cc
@@ -37,7 +37,8 @@ MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options)
       kWriteBufferSize(options.write_buffer_size),
       arena_(options.arena_block_size),
       table_(options.memtable_factory->CreateMemTableRep(
-          comparator_, &arena_, options.prefix_extractor.get())),
+          comparator_, &arena_, options.prefix_extractor.get(),
+          options.info_log.get())),
       num_entries_(0),
       flush_in_progress_(false),
       flush_completed_(false),
@@ -55,7 +56,8 @@ MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options)
     prefix_bloom_.reset(new DynamicBloom(
         options.memtable_prefix_bloom_bits, options.bloom_locality,
         options.memtable_prefix_bloom_probes, nullptr,
-        options.memtable_prefix_bloom_huge_page_tlb_size));
+        options.memtable_prefix_bloom_huge_page_tlb_size,
+        options.info_log.get()));
   }
 }
 
diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h
index be15a608c..ac376d747 100644
--- a/include/rocksdb/memtablerep.h
+++ b/include/rocksdb/memtablerep.h
@@ -44,6 +44,7 @@ class Arena;
 class LookupKey;
 class Slice;
 class SliceTransform;
+class Logger;
 
 typedef void* KeyHandle;
 
@@ -174,7 +175,8 @@ class MemTableRepFactory {
  public:
   virtual ~MemTableRepFactory() {}
   virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
-                                         Arena*, const SliceTransform*) = 0;
+                                         Arena*, const SliceTransform*,
+                                         Logger* logger) = 0;
   virtual const char* Name() const = 0;
 };
 
@@ -182,8 +184,8 @@ class MemTableRepFactory {
 class SkipListFactory : public MemTableRepFactory {
  public:
   virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
-                                         Arena*,
-                                         const SliceTransform*) override;
+                                         Arena*, const SliceTransform*,
+                                         Logger* logger) override;
   virtual const char* Name() const override { return "SkipListFactory"; }
 };
 
@@ -201,9 +203,9 @@ class VectorRepFactory : public MemTableRepFactory {
 
  public:
   explicit VectorRepFactory(size_t count = 0) : count_(count) { }
-  virtual MemTableRep* CreateMemTableRep(
-      const MemTableRep::KeyComparator&, Arena*,
-      const SliceTransform*) override;
+  virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
+                                         Arena*, const SliceTransform*,
+                                         Logger* logger) override;
   virtual const char* Name() const override {
     return "VectorRepFactory";
   }
diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc
index f1cb3db47..43daaa9a9 100644
--- a/table/plain_table_reader.cc
+++ b/table/plain_table_reader.cc
@@ -272,7 +272,8 @@ void PlainTableReader::AllocateIndexAndBloom(int num_prefixes) {
   uint32_t bloom_total_bits = num_prefixes * kBloomBitsPerKey;
   if (bloom_total_bits > 0) {
     bloom_.reset(new DynamicBloom(bloom_total_bits, options_.bloom_locality,
-                                  6, nullptr, huge_page_tlb_size_));
+                                  6, nullptr, huge_page_tlb_size_,
+                                  options_.info_log.get()));
   }
 }
 
@@ -328,8 +329,8 @@ void PlainTableReader::FillIndexes(
   Log(options_.info_log, "Reserving %zu bytes for plain table's sub_index",
       kSubIndexSize);
   auto total_allocate_size = sizeof(uint32_t) * index_size_ + kSubIndexSize;
-  char* allocated =
-      arena_.AllocateAligned(total_allocate_size, huge_page_tlb_size_);
+  char* allocated = arena_.AllocateAligned(
+      total_allocate_size, huge_page_tlb_size_, options_.info_log.get());
   index_ = reinterpret_cast<uint32_t*>(allocated);
   sub_index_ = allocated + sizeof(uint32_t) * index_size_;
 
@@ -398,7 +399,8 @@ Status PlainTableReader::PopulateIndex(TableProperties* props) {
   uint32_t num_bloom_bits = table_properties_->num_entries * kBloomBitsPerKey;
   if (num_bloom_bits > 0) {
     bloom_.reset(new DynamicBloom(num_bloom_bits, options_.bloom_locality, 6,
-                                  nullptr, huge_page_tlb_size_));
+                                  nullptr, huge_page_tlb_size_,
+                                  options_.info_log.get()));
   }
 }
 
diff --git a/util/arena.cc b/util/arena.cc
index 0e36bb560..094266a73 100644
--- a/util/arena.cc
+++ b/util/arena.cc
@@ -10,6 +10,7 @@
 #include "util/arena.h"
 #include <sys/mman.h>
 #include <algorithm>
+#include "rocksdb/env.h"
 
 namespace rocksdb {
 
@@ -70,20 +71,23 @@ char* Arena::AllocateFallback(size_t bytes, bool aligned) {
   }
 }
 
-char* Arena::AllocateAligned(size_t bytes, size_t huge_page_tlb_size) {
+char* Arena::AllocateAligned(size_t bytes, size_t huge_page_tlb_size,
+                             Logger* logger) {
   assert((kAlignUnit & (kAlignUnit - 1)) ==
          0);  // Pointer size should be a power of 2
 
#ifdef OS_LINUX
   if (huge_page_tlb_size > 0 && bytes > 0) {
     // Allocate from a huge page TBL table.
+    assert(logger != nullptr);  // a logger must be passed in
     size_t reserved_size =
         ((bytes - 1U) / huge_page_tlb_size + 1U) * huge_page_tlb_size;
     assert(reserved_size >= bytes);
     void* addr = mmap(nullptr, reserved_size, (PROT_READ | PROT_WRITE),
                       (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB), 0, 0);
     if (addr == MAP_FAILED) {
-      // TODO(sdong): Better handling
+      Warn(logger, "AllocateAligned failed to allocate huge TLB pages: %s",
+           strerror(errno));
+      // fall back to malloc
     } else {
       blocks_memory_ += reserved_size;
diff --git a/util/arena.h b/util/arena.h
index a4dff495b..161a253aa 100644
--- a/util/arena.h
+++ b/util/arena.h
@@ -20,6 +20,8 @@
 
 namespace rocksdb {
 
+class Logger;
+
 class Arena {
  public:
   // No copying allowed
@@ -41,7 +43,12 @@ class Arena {
   // huge pages for it to be allocated, like:
   //     sysctl -w vm.nr_hugepages=20
   // See linux doc Documentation/vm/hugetlbpage.txt for details.
-  char* AllocateAligned(size_t bytes, size_t huge_page_tlb_size = 0);
+  // Huge page allocation can fail. In that case it falls back to a normal
+  // allocation, and the failure is logged to the logger. So when calling with
+  // huge_page_tlb_size > 0, we highly recommend passing in a logger.
+  // Otherwise, the error message will be printed out to stderr directly.
+  char* AllocateAligned(size_t bytes, size_t huge_page_tlb_size = 0,
+                        Logger* logger = nullptr);
 
   // Returns an estimate of the total memory usage of data allocated
   // by the arena (exclude the space allocated but not yet used for future
diff --git a/util/dynamic_bloom.cc b/util/dynamic_bloom.cc
index bc48b9fd3..09ffe71ec 100644
--- a/util/dynamic_bloom.cc
+++ b/util/dynamic_bloom.cc
@@ -22,7 +22,7 @@ static uint32_t BloomHash(const Slice& key) {
 DynamicBloom::DynamicBloom(uint32_t total_bits, uint32_t cl_per_block,
                            uint32_t num_probes,
                            uint32_t (*hash_func)(const Slice& key),
-                           size_t huge_page_tlb_size)
+                           size_t huge_page_tlb_size, Logger* logger)
     : kBlocked(cl_per_block > 0),
       kBitsPerBlock(std::min(cl_per_block, num_probes) * CACHE_LINE_SIZE * 8),
       kTotalBits((kBlocked ? (total_bits + kBitsPerBlock - 1) / kBitsPerBlock *
@@ -40,7 +40,7 @@ DynamicBloom::DynamicBloom(uint32_t total_bits, uint32_t cl_per_block,
     sz += CACHE_LINE_SIZE - 1;
   }
   raw_ = reinterpret_cast<unsigned char*>(
-      arena_.AllocateAligned(sz, huge_page_tlb_size));
+      arena_.AllocateAligned(sz, huge_page_tlb_size, logger));
   memset(raw_, 0, sz);
   if (kBlocked && (reinterpret_cast<uint64_t>(raw_) % CACHE_LINE_SIZE)) {
     data_ = raw_ + CACHE_LINE_SIZE -
diff --git a/util/dynamic_bloom.h b/util/dynamic_bloom.h
index f91bb8f91..73476eb3b 100644
--- a/util/dynamic_bloom.h
+++ b/util/dynamic_bloom.h
@@ -13,6 +13,7 @@
 namespace rocksdb {
 
 class Slice;
+class Logger;
 
 class DynamicBloom {
  public:
@@ -29,7 +30,8 @@ class DynamicBloom {
   explicit DynamicBloom(uint32_t total_bits, uint32_t cl_per_block = 0,
                         uint32_t num_probes = 6,
                         uint32_t (*hash_func)(const Slice& key) = nullptr,
-                        size_t huge_page_tlb_size = 0);
+                        size_t huge_page_tlb_size = 0,
+                        Logger* logger = nullptr);
 
   ~DynamicBloom() {}
 
diff --git a/util/hash_cuckoo_rep.cc b/util/hash_cuckoo_rep.cc
index d10bc5d2a..a8864692f 100644
--- a/util/hash_cuckoo_rep.cc
+++ b/util/hash_cuckoo_rep.cc
@@ -314,7 +314,8 @@ void HashCuckooRep::Insert(KeyHandle handle) {
     // immutable.
     if (backup_table_.get() == nullptr) {
       VectorRepFactory factory(10);
-      backup_table_.reset(factory.CreateMemTableRep(compare_, arena_, nullptr));
+      backup_table_.reset(
+          factory.CreateMemTableRep(compare_, arena_, nullptr, nullptr));
       is_nearly_full_ = true;
     }
     backup_table_->Insert(key);
@@ -595,7 +596,7 @@ void HashCuckooRep::Iterator::SeekToLast() {
 
 MemTableRep* HashCuckooRepFactory::CreateMemTableRep(
     const MemTableRep::KeyComparator& compare, Arena* arena,
-    const SliceTransform* transform) {
+    const SliceTransform* transform, Logger* logger) {
   // The estimated average fullness.  The write performance of any close hash
   // degrades as the fullness of the mem-table increases.  Setting kFullness
   // to a value around 0.7 can better avoid write performance degradation while
diff --git a/util/hash_cuckoo_rep.h b/util/hash_cuckoo_rep.h
index 8f97ed4e4..669b6b7d4 100644
--- a/util/hash_cuckoo_rep.h
+++ b/util/hash_cuckoo_rep.h
@@ -29,7 +29,7 @@ class HashCuckooRepFactory : public MemTableRepFactory {
 
   virtual MemTableRep* CreateMemTableRep(
       const MemTableRep::KeyComparator& compare, Arena* arena,
-      const SliceTransform* transform) override;
+      const SliceTransform* transform, Logger* logger) override;
 
   virtual const char* Name() const override { return "HashCuckooRepFactory"; }
 
diff --git a/util/hash_linklist_rep.cc b/util/hash_linklist_rep.cc
index acd78c5bb..506f1d8b4 100644
--- a/util/hash_linklist_rep.cc
+++ b/util/hash_linklist_rep.cc
@@ -54,7 +54,7 @@ class HashLinkListRep : public MemTableRep {
  public:
   HashLinkListRep(const MemTableRep::KeyComparator& compare, Arena* arena,
                   const SliceTransform* transform, size_t bucket_size,
-                  size_t huge_page_tlb_size);
+                  size_t huge_page_tlb_size, Logger* logger);
 
   virtual KeyHandle Allocate(const size_t len, char** buf) override;
 
@@ -307,13 +307,14 @@ class HashLinkListRep : public MemTableRep {
 
 HashLinkListRep::HashLinkListRep(const MemTableRep::KeyComparator& compare,
                                  Arena* arena, const SliceTransform* transform,
-                                 size_t bucket_size, size_t huge_page_tlb_size)
+                                 size_t bucket_size, size_t huge_page_tlb_size,
+                                 Logger* logger)
     : MemTableRep(arena),
       bucket_size_(bucket_size),
      transform_(transform),
       compare_(compare) {
   char* mem = arena_->AllocateAligned(sizeof(port::AtomicPointer) * bucket_size,
-                                      huge_page_tlb_size);
+                                      huge_page_tlb_size, logger);
 
   buckets_ = new (mem) port::AtomicPointer[bucket_size];
 
@@ -469,9 +470,9 @@ Node* HashLinkListRep::FindGreaterOrEqualInBucket(Node* head,
 
 MemTableRep* HashLinkListRepFactory::CreateMemTableRep(
     const MemTableRep::KeyComparator& compare, Arena* arena,
-    const SliceTransform* transform) {
+    const SliceTransform* transform, Logger* logger) {
   return new HashLinkListRep(compare, arena, transform, bucket_count_,
-                             huge_page_tlb_size_);
+                             huge_page_tlb_size_, logger);
 }
 
 MemTableRepFactory* NewHashLinkListRepFactory(size_t bucket_count,
diff --git a/util/hash_linklist_rep.h b/util/hash_linklist_rep.h
index 4a9fd0009..bf96e8b0e 100644
--- a/util/hash_linklist_rep.h
+++ b/util/hash_linklist_rep.h
@@ -23,7 +23,7 @@ class HashLinkListRepFactory : public MemTableRepFactory {
 
   virtual MemTableRep* CreateMemTableRep(
       const MemTableRep::KeyComparator& compare, Arena* arena,
-      const SliceTransform* transform) override;
+      const SliceTransform* transform, Logger* logger) override;
 
   virtual const char* Name() const override {
     return "HashLinkListRepFactory";
diff --git a/util/hash_skiplist_rep.cc b/util/hash_skiplist_rep.cc
index 21df9f62b..1f03874d1 100644
--- a/util/hash_skiplist_rep.cc
+++ b/util/hash_skiplist_rep.cc
@@ -320,7 +320,7 @@ MemTableRep::Iterator* HashSkipListRep::GetDynamicPrefixIterator() {
 
 MemTableRep* HashSkipListRepFactory::CreateMemTableRep(
     const MemTableRep::KeyComparator& compare, Arena* arena,
-    const SliceTransform* transform) {
+    const SliceTransform* transform, Logger* logger) {
   return new HashSkipListRep(compare, arena, transform, bucket_count_,
                              skiplist_height_, skiplist_branching_factor_);
 }
diff --git a/util/hash_skiplist_rep.h b/util/hash_skiplist_rep.h
index 16903c684..6fec60a47 100644
--- a/util/hash_skiplist_rep.h
+++ b/util/hash_skiplist_rep.h
@@ -27,7 +27,7 @@ class HashSkipListRepFactory : public MemTableRepFactory {
 
   virtual MemTableRep* CreateMemTableRep(
       const MemTableRep::KeyComparator& compare, Arena* arena,
-      const SliceTransform* transform) override;
+      const SliceTransform* transform, Logger* logger) override;
 
   virtual const char* Name() const override {
     return "HashSkipListRepFactory";
diff --git a/util/skiplistrep.cc b/util/skiplistrep.cc
index 93f7134c7..f36edf28d 100644
--- a/util/skiplistrep.cc
+++ b/util/skiplistrep.cc
@@ -116,7 +116,7 @@ public:
 
 MemTableRep* SkipListFactory::CreateMemTableRep(
     const MemTableRep::KeyComparator& compare, Arena* arena,
-    const SliceTransform*) {
+    const SliceTransform*, Logger* logger) {
   return new SkipListRep(compare, arena);
 }
 
diff --git a/util/vectorrep.cc b/util/vectorrep.cc
index c7f9cca2a..00e5c7450 100644
--- a/util/vectorrep.cc
+++ b/util/vectorrep.cc
@@ -275,7 +275,7 @@ MemTableRep::Iterator* VectorRep::GetIterator() {
 
 MemTableRep* VectorRepFactory::CreateMemTableRep(
     const MemTableRep::KeyComparator& compare, Arena* arena,
-    const SliceTransform*) {
+    const SliceTransform*, Logger* logger) {
   return new VectorRep(compare, arena, count_);
 }
 }  // namespace rocksdb
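
Note for downstream implementers: any custom MemTableRepFactory must be updated to the new four-argument CreateMemTableRep() signature, or its override will no longer match the pure virtual and the build will fail. A minimal sketch of an updated factory follows; the LoggingSkipListFactory name and its delegation to the built-in SkipListFactory are illustrative only, not part of this patch:

    #include "rocksdb/memtablerep.h"

    namespace rocksdb {

    // Hypothetical factory showing the updated override: it simply
    // delegates to SkipListFactory and forwards the logger.
    class LoggingSkipListFactory : public MemTableRepFactory {
     public:
      virtual MemTableRep* CreateMemTableRep(
          const MemTableRep::KeyComparator& compare, Arena* arena,
          const SliceTransform* transform, Logger* logger) override {
        // `logger` is Options.info_log (see db/memtable.cc above); forward it
        // so allocations that may use huge TLB pages can report failures.
        return base_.CreateMemTableRep(compare, arena, transform, logger);
      }
      virtual const char* Name() const override {
        return "LoggingSkipListFactory";
      }

     private:
      SkipListFactory base_;
    };

    }  // namespace rocksdb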
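
The logger matters mainly on the huge-page path: when huge_page_tlb_size > 0 and mmap() with MAP_HUGETLB fails (e.g. vm.nr_hugepages is not configured), Arena::AllocateAligned() now warns through the logger and falls back to a normal allocation instead of failing silently. A sketch of a caller, assuming the internal util/arena.h header; the AllocateIndexBuffer helper name is hypothetical:

    #include "rocksdb/env.h"  // rocksdb::Logger
    #include "util/arena.h"   // internal header, used here for illustration

    namespace rocksdb {

    // Hypothetical helper: prefer 2MB huge pages for a large, aligned buffer.
    // On failure the arena logs a warning via `logger` and falls back to a
    // regular allocation, so the returned pointer is usable either way.
    char* AllocateIndexBuffer(Arena* arena, size_t bytes, Logger* logger) {
      const size_t kHugePageSize = 2 * 1024 * 1024;  // needs vm.nr_hugepages
      return arena->AllocateAligned(bytes, kHugePageSize, logger);
    }

    }  // namespace rocksdb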