From 284c365b77834cd250b3c73cc525d6f05aecd9ab Mon Sep 17 00:00:00 2001 From: sdong Date: Tue, 1 Apr 2014 18:36:18 -0700 Subject: [PATCH 1/5] Fix valgrind error caused by FileMetaData as two level iterator's index block handle Summary: It is a regression valgrind bug caused by using FileMetaData as index block handle. One of the fields of FileMetaData is not initialized after being constructed and copied, but I'm not able to find which one. Also, I realized that it's not a good idea to use FileMetaData as in TwoLevelIterator::InitDataBlock(), a copied FileMetaData can be compared with the one in version set byte by byte, but the refs can be changed. Also comparing such a large structure is slightly more expensive. Use a simpler structure instead Test Plan: Run the failing valgrind test (Harness.RandomizedLongDB) make all check Reviewers: igor, haobo, ljin Reviewed By: igor CC: yhchiang, leveldb Differential Revision: https://reviews.facebook.net/D17409 --- db/table_cache.h | 2 ++ db/version_set.cc | 45 +++++++++++++++++++++++++++++++++------------ 2 files changed, 35 insertions(+), 12 deletions(-) diff --git a/db/table_cache.h b/db/table_cache.h index 42dee2f0f..5f1c29ea5 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -25,6 +25,8 @@ namespace rocksdb { class Env; struct FileMetaData; +// TODO(sdong): try to come up with a better API to pass the file information +// other than simply passing FileMetaData. class TableCache { public: TableCache(const std::string& dbname, const Options* options, diff --git a/db/version_set.cc b/db/version_set.cc index 77275bdd8..2057d6dd4 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -140,6 +140,18 @@ bool SomeFileOverlapsRange( return !BeforeFile(ucmp, largest_user_key, files[index]); } +namespace { +// Used for LevelFileNumIterator to pass "block handle" value, +// which actually means file information in this iterator. +// It contains subset of fields of FileMetaData, that is sufficient +// for table cache to use. 
+struct EncodedFileMetaData { + uint64_t number; // file number + uint64_t file_size; // file size + Cache::Handle* table_reader_handle; // cached table reader's handler +}; +} // namespace + // An internal iterator. For a given version/level pair, yields // information about the files in the level. For a given entry, key() // is the largest key that occurs in the file, and value() is an @@ -181,14 +193,19 @@ class Version::LevelFileNumIterator : public Iterator { } Slice value() const { assert(Valid()); - return Slice(reinterpret_cast((*flist_)[index_]), - sizeof(FileMetaData)); + auto* file_meta = (*flist_)[index_]; + current_value_.number = file_meta->number; + current_value_.file_size = file_meta->file_size; + current_value_.table_reader_handle = file_meta->table_reader_handle; + return Slice(reinterpret_cast(¤t_value_), + sizeof(EncodedFileMetaData)); } virtual Status status() const { return Status::OK(); } private: const InternalKeyComparator icmp_; const std::vector* const flist_; uint32_t index_; + mutable EncodedFileMetaData current_value_; }; static Iterator* GetFileIterator(void* arg, const ReadOptions& options, @@ -196,7 +213,7 @@ static Iterator* GetFileIterator(void* arg, const ReadOptions& options, const InternalKeyComparator& icomparator, const Slice& file_value, bool for_compaction) { TableCache* cache = reinterpret_cast(arg); - if (file_value.size() != sizeof(FileMetaData)) { + if (file_value.size() != sizeof(EncodedFileMetaData)) { return NewErrorIterator( Status::Corruption("FileReader invoked with unexpected value")); } else { @@ -208,11 +225,13 @@ static Iterator* GetFileIterator(void* arg, const ReadOptions& options, options_copy.prefix = nullptr; } - const FileMetaData* meta_file = - reinterpret_cast(file_value.data()); + const EncodedFileMetaData* encoded_meta = + reinterpret_cast(file_value.data()); + FileMetaData meta(encoded_meta->number, encoded_meta->file_size); + meta.table_reader_handle = encoded_meta->table_reader_handle; return 
cache->NewIterator( - options.prefix ? options_copy : options, soptions, icomparator, - *meta_file, nullptr /* don't need reference to table*/, for_compaction); + options.prefix ? options_copy : options, soptions, icomparator, meta, + nullptr /* don't need reference to table*/, for_compaction); } } @@ -231,11 +250,13 @@ bool Version::PrefixMayMatch(const ReadOptions& options, // key() will always be the biggest value for this SST? may_match = true; } else { - const FileMetaData* meta_file = - reinterpret_cast(level_iter->value().data()); - - may_match = vset_->table_cache_->PrefixMayMatch( - options, vset_->icmp_, *meta_file, internal_prefix, nullptr); + const EncodedFileMetaData* encoded_meta = + reinterpret_cast( + level_iter->value().data()); + FileMetaData meta(encoded_meta->number, encoded_meta->file_size); + meta.table_reader_handle = encoded_meta->table_reader_handle; + may_match = vset_->table_cache_->PrefixMayMatch(options, vset_->icmp_, meta, + internal_prefix, nullptr); } return may_match; } From 4af1954fd60eae50be2deb3ccb020f56eef4b58f Mon Sep 17 00:00:00 2001 From: sdong Date: Wed, 2 Apr 2014 14:48:53 -0700 Subject: [PATCH 2/5] Compaction Filter V1 to use old context struct to keep backward compatible Summary: The previous change D15087 changed existing compaction filter, which makes the commonly used class not backward compatible. Revert the older interface. Use a new interface for V2 instead. 
Test Plan: make all check Reviewers: haobo, yhchiang, igor CC: danguo, dhruba, ljin, igor, leveldb Differential Revision: https://reviews.facebook.net/D17223 --- HISTORY.md | 3 +-- db/db_impl.cc | 10 +++++++++- db/db_test.cc | 6 +++--- include/rocksdb/compaction_filter.h | 15 ++++++++++++--- utilities/ttl/db_ttl.h | 2 +- utilities/ttl/ttl_test.cc | 5 ++--- 6 files changed, 28 insertions(+), 13 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 44ff73632..0946d441e 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -15,13 +15,12 @@ * Added Env::GetThreadPoolQueueLen(), which returns the waiting queue length of thread pools * Added a command "checkconsistency" in ldb tool, which checks if file system state matches DB state (file existence and file sizes) -* CompactionFilter::Context is now CompactionFilterContext. It is shared by CompactionFilter and CompactionFilterV2 ### New Features * If we find one truncated record at the end of the MANIFEST or WAL files, we will ignore it. We assume that writers of these records were interrupted and that we can safely ignore it. -* Now compaction filter has a V2 interface. It buffers the kv-pairs sharing the same key prefix, process them in batches, and return the batched results back to DB. +* Now compaction filter has a V2 interface. It buffers the kv-pairs sharing the same key prefix, process them in batches, and return the batched results back to DB. The new interface uses a new structure CompactionFilterContext for the same purpose as CompactionFilter::Context in V1. * Geo-spatial support for locations and radial-search. 
## 2.7.0 (01/28/2014) diff --git a/db/db_impl.cc b/db/db_impl.cc index e532d5b17..c6481ce9c 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -117,6 +117,14 @@ struct DBImpl::CompactionState { total_bytes(0) { } + // Create a client visible context of this compaction + CompactionFilter::Context GetFilterContextV1() { + CompactionFilter::Context context; + context.is_full_compaction = compaction->IsFullCompaction(); + context.is_manual_compaction = compaction->IsManualCompaction(); + return context; + } + // Create a client visible context of this compaction CompactionFilterContext GetFilterContext() { CompactionFilterContext context; @@ -2545,7 +2553,7 @@ Status DBImpl::ProcessKeyValueCompaction( auto compaction_filter = options_.compaction_filter; std::unique_ptr compaction_filter_from_factory = nullptr; if (!compaction_filter) { - auto context = compact->GetFilterContext(); + auto context = compact->GetFilterContextV1(); compaction_filter_from_factory = options_.compaction_filter_factory->CreateCompactionFilter(context); compaction_filter = compaction_filter_from_factory.get(); diff --git a/db/db_test.cc b/db/db_test.cc index 1e8846376..0695b5cc7 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -2483,7 +2483,7 @@ class KeepFilterFactory : public CompactionFilterFactory { : check_context_(check_context) {} virtual std::unique_ptr CreateCompactionFilter( - const CompactionFilterContext& context) override { + const CompactionFilter::Context& context) override { if (check_context_) { ASSERT_EQ(expect_full_compaction_.load(), context.is_full_compaction); ASSERT_EQ(expect_manual_compaction_.load(), context.is_manual_compaction); @@ -2500,7 +2500,7 @@ class KeepFilterFactory : public CompactionFilterFactory { class DeleteFilterFactory : public CompactionFilterFactory { public: virtual std::unique_ptr CreateCompactionFilter( - const CompactionFilterContext& context) override { + const CompactionFilter::Context& context) override { if (context.is_manual_compaction) { 
return std::unique_ptr(new DeleteFilter()); } else { @@ -2516,7 +2516,7 @@ class ChangeFilterFactory : public CompactionFilterFactory { explicit ChangeFilterFactory() {} virtual std::unique_ptr CreateCompactionFilter( - const CompactionFilterContext& context) override { + const CompactionFilter::Context& context) override { return std::unique_ptr(new ChangeFilter()); } diff --git a/include/rocksdb/compaction_filter.h b/include/rocksdb/compaction_filter.h index f54ee620c..59b050923 100644 --- a/include/rocksdb/compaction_filter.h +++ b/include/rocksdb/compaction_filter.h @@ -31,6 +31,15 @@ struct CompactionFilterContext { class CompactionFilter { public: + // Context information of a compaction run + struct Context { + // Does this compaction run include all data files + bool is_full_compaction; + // Is this compaction requested by the client (true), + // or is it occurring as an automatic compaction process + bool is_manual_compaction; + }; + virtual ~CompactionFilter() {} // The compaction process invokes this @@ -105,7 +114,7 @@ class CompactionFilterFactory { virtual ~CompactionFilterFactory() { } virtual std::unique_ptr CreateCompactionFilter( - const CompactionFilterContext& context) = 0; + const CompactionFilter::Context& context) = 0; // Returns a name that identifies this compaction filter factory. 
virtual const char* Name() const = 0; @@ -115,8 +124,8 @@ class CompactionFilterFactory { // return any filter class DefaultCompactionFilterFactory : public CompactionFilterFactory { public: - virtual std::unique_ptr - CreateCompactionFilter(const CompactionFilterContext& context) override { + virtual std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& context) override { return std::unique_ptr(nullptr); } diff --git a/utilities/ttl/db_ttl.h b/utilities/ttl/db_ttl.h index d23ef88cf..519ae32c7 100644 --- a/utilities/ttl/db_ttl.h +++ b/utilities/ttl/db_ttl.h @@ -192,7 +192,7 @@ class TtlCompactionFilterFactory : public CompactionFilterFactory { user_comp_filter_factory_(comp_filter_factory) { } virtual std::unique_ptr CreateCompactionFilter( - const CompactionFilterContext& context) { + const CompactionFilter::Context& context) { return std::unique_ptr( new TtlCompactionFilter( ttl_, diff --git a/utilities/ttl/ttl_test.cc b/utilities/ttl/ttl_test.cc index a981cceb8..789128729 100644 --- a/utilities/ttl/ttl_test.cc +++ b/utilities/ttl/ttl_test.cc @@ -283,9 +283,8 @@ class TtlTest { kNewValue_(kNewValue) { } - virtual std::unique_ptr - CreateCompactionFilter( - const CompactionFilterContext& context) override { + virtual std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& context) override { return std::unique_ptr( new TestFilter(kSampleSize_, kNewValue_)); } From 56ca75e89ee3e89510fcf92cc58177cbd05566e8 Mon Sep 17 00:00:00 2001 From: Albert Strasheim Date: Wed, 2 Apr 2014 15:15:57 -0700 Subject: [PATCH 3/5] crc32: build a whole special Extend function for SSE 4.2. Disassembling the Extend function shows something that looks much more healthy now. The SSE 4.2 instructions are right there in the body of the function. 
Intel(R) Core(TM) i7-3540M CPU @ 3.00GHz Before: crc32c: 1.305 micros/op 766260 ops/sec; 2993.2 MB/s (4K per op) After: crc32c: 0.442 micros/op 2263843 ops/sec; 8843.1 MB/s (4K per op) --- util/crc32c.cc | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/util/crc32c.cc b/util/crc32c.cc index 04312d6f6..50178ae71 100644 --- a/util/crc32c.cc +++ b/util/crc32c.cc @@ -334,19 +334,8 @@ static bool isSSE42() { #endif } -typedef void (*Function)(uint64_t*, uint8_t const**); - -static inline Function Choose_CRC32() { - return isSSE42() ? Fast_CRC32 : Slow_CRC32; -} - -static Function func = Choose_CRC32(); - -static inline void CRC32(uint64_t* l, uint8_t const **p) { - func(l, p); -} - -uint32_t Extend(uint32_t crc, const char* buf, size_t size) { +template +uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) { const uint8_t *p = reinterpret_cast(buf); const uint8_t *e = p + size; uint64_t l = crc ^ 0xffffffffu; @@ -388,5 +377,17 @@ uint32_t Extend(uint32_t crc, const char* buf, size_t size) { return l ^ 0xffffffffu; } +typedef uint32_t (*Function)(uint32_t, const char*, size_t); + +static inline Function Choose_Extend() { + return isSSE42() ? ExtendImpl : ExtendImpl; +} + +Function ChosenExtend = Choose_Extend(); + +uint32_t Extend(uint32_t crc, const char* buf, size_t size) { + return ChosenExtend(crc, buf, size); +} + } // namespace crc32c } // namespace rocksdb From 158845ba9af59228ea08051848426deb532380b6 Mon Sep 17 00:00:00 2001 From: sdong Date: Wed, 2 Apr 2014 16:34:19 -0700 Subject: [PATCH 4/5] Move a info logging out of DB Mutex Summary: As we know, logging can be slow, or even hang for some file systems. Move one more logging out of DB mutex. 
Test Plan: make all check Reviewers: haobo, igor, ljin Reviewed By: igor CC: yhchiang, nkg-, leveldb Differential Revision: https://reviews.facebook.net/D17427 --- db/db_impl.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index c6481ce9c..bb1f839a9 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -4034,6 +4034,9 @@ Status DBImpl::MakeRoomForWrite(bool force, new_mem = new MemTable(internal_comparator_, options_); new_superversion = new SuperVersion(); } + Log(options_.info_log, + "New memtable created with log file: #%lu\n", + (unsigned long)new_log_number); } mutex_.Lock(); if (!s.ok()) { @@ -4051,9 +4054,6 @@ Status DBImpl::MakeRoomForWrite(bool force, } mem_ = new_mem; mem_->Ref(); - Log(options_.info_log, - "New memtable created with log file: #%lu\n", - (unsigned long)logfile_number_); mem_->SetLogNumber(logfile_number_); force = false; // Do not force another compaction if have room MaybeScheduleFlushOrCompaction(); From d1d19f5db3ccece60f76fe19fa06f72eb47aff6a Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Wed, 2 Apr 2014 17:24:30 -0700 Subject: [PATCH 5/5] Fix valgrind error in c_test --- db/c_test.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/db/c_test.c b/db/c_test.c index d8fa8eddb..4a7957b14 100644 --- a/db/c_test.c +++ b/db/c_test.c @@ -439,7 +439,8 @@ int main(int argc, char** argv) { rocksdb_close(db); rocksdb_destroy_db(options, dbname, &err); - rocksdb_options_set_filter_policy(options, rocksdb_filterpolicy_create_bloom(10)); + rocksdb_filterpolicy_t* policy = rocksdb_filterpolicy_create_bloom(10); + rocksdb_options_set_filter_policy(options, policy); rocksdb_options_set_prefix_extractor(options, rocksdb_slicetransform_create_fixed_prefix(3)); rocksdb_options_set_hash_skip_list_rep(options, 50000, 4, 4); @@ -477,6 +478,7 @@ int main(int argc, char** argv) { rocksdb_iter_get_error(iter, &err); CheckNoError(err); rocksdb_iter_destroy(iter); + 
rocksdb_filterpolicy_destroy(policy); } StartPhase("cleanup");