diff --git a/.gitignore b/.gitignore index 4ed05c5e3..ccbb46b03 100644 --- a/.gitignore +++ b/.gitignore @@ -31,8 +31,14 @@ coverage/COVERAGE_REPORT package/ .phutil_module_cache tags + +java/out java/*.log java/include/org_rocksdb_*.h + +.idea/ +*.iml + unity.cc java/crossbuild/.vagrant .vagrant/ diff --git a/Makefile b/Makefile index 2e101bede..d9a8feffa 100644 --- a/Makefile +++ b/Makefile @@ -528,7 +528,11 @@ ROCKSDB_SOURCES_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PAT ifeq ($(PLATFORM), OS_MACOSX) ROCKSDBJNILIB = librocksdbjni-osx.jnilib ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-osx.jar -JAVA_INCLUDE = -I/System/Library/Frameworks/JavaVM.framework/Headers/ +ifneq ("$(wildcard $(JAVA_HOME)/include/darwin)","") + JAVA_INCLUDE = -I$(JAVA_HOME)/include -I $(JAVA_HOME)/include/darwin +else + JAVA_INCLUDE = -I/System/Library/Frameworks/JavaVM.framework/Headers/ +endif endif libz.a: diff --git a/db/column_family.h b/db/column_family.h index 9c415c2a8..96b08c52e 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -166,7 +166,7 @@ class ColumnFamilyData { bool IsDropped() const { return dropped_; } // thread-safe - int NumberLevels() const { return options_.num_levels; } + int NumberLevels() const { return ioptions_.num_levels; } void SetLogNumber(uint64_t log_number) { log_number_ = log_number; } uint64_t GetLogNumber() const { return log_number_; } diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index 84bd95839..63d621c50 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -553,8 +553,8 @@ Compaction* UniversalCompactionPicker::PickCompaction( if ((c = PickCompactionUniversalReadAmp( mutable_cf_options, version, score, UINT_MAX, num_files, log_buffer)) != nullptr) { - LogToBuffer(log_buffer, "[%s] Universal: compacting for file num\n", - version->cfd_->GetName().c_str()); + LogToBuffer(log_buffer, "[%s] Universal: compacting for file num -- %u\n", + version->cfd_->GetName().c_str(), num_files); } } } diff --git a/db/db_bench.cc b/db/db_bench.cc index 6d611ae1c..f0fe5e02e 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -168,8 +168,9 @@ DEFINE_int32(duration, 0, "Time in seconds for the random-ops tests to run." DEFINE_int32(value_size, 100, "Size of each value"); -DEFINE_int32(seekseq_next, 0, "How many times to call Next() after Seek() in " - "fillseekseq"); +DEFINE_int32(seek_nexts, 0, + "How many times to call Next() after Seek() in " + "fillseekseq and seekrandom"); DEFINE_bool(use_uint64_comparator, false, "use Uint64 user comparator"); @@ -1231,7 +1232,7 @@ class Benchmark { writes_ = (FLAGS_writes < 0 ? FLAGS_num : FLAGS_writes); value_size_ = FLAGS_value_size; key_size_ = FLAGS_key_size; - entries_per_batch_ = 1; + entries_per_batch_ = FLAGS_batch_size; write_options_ = WriteOptions(); if (FLAGS_sync) { write_options_.sync = true; @@ -1286,7 +1287,6 @@ class Benchmark { } else if (name == Slice("readrandomfast")) { method = &Benchmark::ReadRandomFast; } else if (name == Slice("multireadrandom")) { - entries_per_batch_ = FLAGS_batch_size; fprintf(stderr, "entries_per_batch = %" PRIi64 "\n", entries_per_batch_); method = &Benchmark::MultiReadRandom; @@ -2265,6 +2265,7 @@ class Benchmark { std::unique_ptr key_guard(key.data()); Duration duration(FLAGS_duration, reads_); + char value_buffer[256]; while (!duration.Done(1)) { if (!FLAGS_use_tailing_iterator && FLAGS_iter_refresh_interval_us >= 0) { uint64_t now = FLAGS_env->NowMicros(); @@ -2296,6 +2297,16 @@ class Benchmark { if (iter_to_use->Valid() && iter_to_use->key().compare(key) == 0) { found++; } + + for (int j = 0; j < FLAGS_seek_nexts && iter_to_use->Valid(); ++j) { + // Copy out iterator's value to make sure we read them. + Slice value = iter_to_use->value(); + memcpy(value_buffer, value.data(), + std::min(value.size(), sizeof(value_buffer))); + iter_to_use->Next(); + assert(iter_to_use->status().ok()); + } + thread->stats.FinishedOps(&db_, db_.db, 1); } delete single_iter; @@ -2820,7 +2831,7 @@ class Benchmark { assert(iter->Valid() && iter->key() == key); thread->stats.FinishedOps(nullptr, db, 1); - for (int j = 0; j < FLAGS_seekseq_next && i+1 < FLAGS_num; ++j) { + for (int j = 0; j < FLAGS_seek_nexts && i + 1 < FLAGS_num; ++j) { iter->Next(); GenerateKeyFromInt(++i, FLAGS_num, &key); assert(iter->Valid() && iter->key() == key); diff --git a/db/db_impl.cc b/db/db_impl.cc index d8df10c5a..334b6df0f 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -282,12 +282,12 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { namespace { -Status SanitizeDBOptionsByCFOptions( - const DBOptions* db_opts, +Status SanitizeOptionsByTable( + const DBOptions& db_opts, const std::vector& column_families) { Status s; for (auto cf : column_families) { - s = cf.options.table_factory->SanitizeDBOptions(db_opts); + s = cf.options.table_factory->SanitizeOptions(db_opts, cf.options); if (!s.ok()) { return s; } @@ -1863,12 +1863,16 @@ int DBImpl::NumberLevels(ColumnFamilyHandle* column_family) { int DBImpl::MaxMemCompactionLevel(ColumnFamilyHandle* column_family) { auto cfh = reinterpret_cast(column_family); - return cfh->cfd()->options()->max_mem_compaction_level; + MutexLock l(&mutex_); + return cfh->cfd()->GetSuperVersion()-> + mutable_cf_options.max_mem_compaction_level; } int DBImpl::Level0StopWriteTrigger(ColumnFamilyHandle* column_family) { auto cfh = reinterpret_cast(column_family); - return cfh->cfd()->options()->level0_stop_writes_trigger; + MutexLock l(&mutex_); + return cfh->cfd()->GetSuperVersion()-> + mutable_cf_options.level0_stop_writes_trigger; } Status DBImpl::Flush(const FlushOptions& flush_options, @@ -3828,16 +3832,16 @@ Iterator* DBImpl::NewIterator(const ReadOptions& read_options, // not supported in lite version return nullptr; #else - auto iter = new ForwardIterator(this, read_options, cfd); + SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_); + auto iter = new ForwardIterator(this, read_options, cfd, sv); return NewDBIterator(env_, *cfd->ioptions(), cfd->user_comparator(), iter, - kMaxSequenceNumber, - cfd->options()->max_sequential_skip_in_iterations, - read_options.iterate_upper_bound); + kMaxSequenceNumber, + sv->mutable_cf_options.max_sequential_skip_in_iterations, + read_options.iterate_upper_bound); #endif } else { SequenceNumber latest_snapshot = versions_->LastSequence(); - SuperVersion* sv = nullptr; - sv = cfd->GetReferencedSuperVersion(&mutex_); + SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_); auto snapshot = read_options.snapshot != nullptr @@ -3889,7 +3893,7 @@ Iterator* DBImpl::NewIterator(const ReadOptions& read_options, // that they are likely to be in the same cache line and/or page. ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator( env_, *cfd->ioptions(), cfd->user_comparator(), - snapshot, cfd->options()->max_sequential_skip_in_iterations, + snapshot, sv->mutable_cf_options.max_sequential_skip_in_iterations, read_options.iterate_upper_bound); Iterator* internal_iter = @@ -3908,19 +3912,6 @@ Status DBImpl::NewIterators( std::vector* iterators) { iterators->clear(); iterators->reserve(column_families.size()); - SequenceNumber latest_snapshot = 0; - std::vector super_versions; - super_versions.reserve(column_families.size()); - - if (!read_options.tailing) { - mutex_.Lock(); - latest_snapshot = versions_->LastSequence(); - for (auto cfh : column_families) { - auto cfd = reinterpret_cast(cfh)->cfd(); - super_versions.push_back(cfd->GetSuperVersion()->Ref()); - } - mutex_.Unlock(); - } if (read_options.tailing) { #ifdef ROCKSDB_LITE @@ -3929,17 +3920,21 @@ Status DBImpl::NewIterators( #else for (auto cfh : column_families) { auto cfd = reinterpret_cast(cfh)->cfd(); - auto iter = new ForwardIterator(this, read_options, cfd); + SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_); + auto iter = new ForwardIterator(this, read_options, cfd, sv); iterators->push_back( NewDBIterator(env_, *cfd->ioptions(), cfd->user_comparator(), iter, - kMaxSequenceNumber, - cfd->options()->max_sequential_skip_in_iterations)); + kMaxSequenceNumber, + sv->mutable_cf_options.max_sequential_skip_in_iterations)); } #endif } else { + SequenceNumber latest_snapshot = versions_->LastSequence(); + for (size_t i = 0; i < column_families.size(); ++i) { - auto cfh = reinterpret_cast(column_families[i]); - auto cfd = cfh->cfd(); + auto* cfd = reinterpret_cast( + column_families[i])->cfd(); + SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_); auto snapshot = read_options.snapshot != nullptr @@ -3949,9 +3944,9 @@ Status DBImpl::NewIterators( ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator( env_, *cfd->ioptions(), cfd->user_comparator(), snapshot, - cfd->options()->max_sequential_skip_in_iterations); + sv->mutable_cf_options.max_sequential_skip_in_iterations); Iterator* internal_iter = NewInternalIterator( - read_options, cfd, super_versions[i], db_iter->GetArena()); + read_options, cfd, sv, db_iter->GetArena()); db_iter->SetIterUnderDBIter(internal_iter); iterators->push_back(db_iter); } @@ -4129,7 +4124,7 @@ Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) { const uint64_t batch_size = WriteBatchInternal::ByteSize(updates); // Record statistics RecordTick(stats_, NUMBER_KEYS_WRITTEN, my_batch_count); - RecordTick(stats_, BYTES_WRITTEN, WriteBatchInternal::ByteSize(updates)); + RecordTick(stats_, BYTES_WRITTEN, batch_size); if (write_options.disableWAL) { flush_on_destroy_ = true; } @@ -4179,6 +4174,8 @@ Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) { // internal stats default_cf_internal_stats_->AddDBStats( InternalStats::BYTES_WRITTEN, batch_size); + default_cf_internal_stats_->AddDBStats(InternalStats::NUMBER_KEYS_WRITTEN, + my_batch_count); if (!write_options.disableWAL) { default_cf_internal_stats_->AddDBStats( InternalStats::WAL_FILE_SYNCED, 1); @@ -4542,7 +4539,7 @@ Status DBImpl::DeleteFile(std::string name) { name.c_str()); return Status::InvalidArgument("File not found"); } - assert((level > 0) && (level < cfd->NumberLevels())); + assert(level < cfd->NumberLevels()); // If the file is being compacted no need to delete. if (metadata->being_compacted) { @@ -4561,6 +4558,12 @@ Status DBImpl::DeleteFile(std::string name) { return Status::InvalidArgument("File not in last level"); } } + // if level == 0, it has to be the oldest file + if (level == 0 && + cfd->current()->files_[0].back()->fd.GetNumber() != number) { + return Status::InvalidArgument("File in level 0, but not oldest"); + } + edit.SetColumnFamily(cfd->GetID()); edit.DeleteFile(level, number); status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), &edit, &mutex_, db_directory_.get()); @@ -4703,7 +4706,7 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { Status DB::Open(const DBOptions& db_options, const std::string& dbname, const std::vector& column_families, std::vector* handles, DB** dbptr) { - Status s = SanitizeDBOptionsByCFOptions(&db_options, column_families); + Status s = SanitizeOptionsByTable(db_options, column_families); if (!s.ok()) { return s; } diff --git a/db/db_impl_readonly.cc b/db/db_impl_readonly.cc index 31ebdbedd..c98693d38 100644 --- a/db/db_impl_readonly.cc +++ b/db/db_impl_readonly.cc @@ -53,7 +53,7 @@ Iterator* DBImplReadOnly::NewIterator(const ReadOptions& read_options, ? reinterpret_cast( read_options.snapshot)->number_ : latest_snapshot), - cfd->options()->max_sequential_skip_in_iterations); + super_version->mutable_cf_options.max_sequential_skip_in_iterations); auto internal_iter = NewInternalIterator( read_options, cfd, super_version, db_iter->GetArena()); db_iter->SetIterUnderDBIter(internal_iter); @@ -72,16 +72,17 @@ Status DBImplReadOnly::NewIterators( SequenceNumber latest_snapshot = versions_->LastSequence(); for (auto cfh : column_families) { - auto cfd = reinterpret_cast(cfh)->cfd(); - auto db_iter = NewArenaWrappedDbIterator( + auto* cfd = reinterpret_cast(cfh)->cfd(); + auto* sv = cfd->GetSuperVersion()->Ref(); + auto* db_iter = NewArenaWrappedDbIterator( env_, *cfd->ioptions(), cfd->user_comparator(), (read_options.snapshot != nullptr ? reinterpret_cast( read_options.snapshot)->number_ : latest_snapshot), - cfd->options()->max_sequential_skip_in_iterations); - auto internal_iter = NewInternalIterator( - read_options, cfd, cfd->GetSuperVersion()->Ref(), db_iter->GetArena()); + sv->mutable_cf_options.max_sequential_skip_in_iterations); + auto* internal_iter = NewInternalIterator( + read_options, cfd, sv, db_iter->GetArena()); db_iter->SetIterUnderDBIter(internal_iter); iterators->push_back(db_iter); } diff --git a/db/db_test.cc b/db/db_test.cc index 169718387..cfd9dcd9b 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -807,8 +807,8 @@ class DBTest { } std::string AllEntriesFor(const Slice& user_key, int cf = 0) { - ScopedArenaIterator iter; Arena arena; + ScopedArenaIterator iter; if (cf == 0) { iter.set(dbfull()->TEST_NewInternalIterator(&arena)); } else { @@ -5272,7 +5272,7 @@ TEST(DBTest, CompactBetweenSnapshots) { do { Options options = CurrentOptions(); options.disable_auto_compactions = true; - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, &options); Random rnd(301); FillLevels("a", "z", 1); @@ -8354,6 +8354,17 @@ TEST(DBTest, TableOptionsSanitizeTest) { options.prefix_extractor.reset(NewNoopTransform()); Destroy(&options); ASSERT_TRUE(TryReopen(&options).IsNotSupported()); + + // Test for check of prefix_extractor when hash index is used for + // block-based table + BlockBasedTableOptions to; + to.index_type = BlockBasedTableOptions::kHashSearch; + options = Options(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(to)); + ASSERT_TRUE(TryReopen(&options).IsInvalidArgument()); + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + ASSERT_OK(TryReopen(&options)); } TEST(DBTest, DBIteratorBoundTest) { @@ -8691,8 +8702,9 @@ TEST(DBTest, DynamicCompactionOptions) { dbfull()->TEST_WaitForFlushMemTable(); }; - // Write 3 files that have the same key range, trigger compaction and - // result in one L1 file + // Write 3 files that have the same key range. + // Since level0_file_num_compaction_trigger is 3, compaction should be + // triggered. The compaction should result in one L1 file gen_l0_kb(0, 64, 1); ASSERT_EQ(NumTableFilesAtLevel(0), 1); gen_l0_kb(0, 64, 1); @@ -8707,6 +8719,10 @@ TEST(DBTest, DynamicCompactionOptions) { ASSERT_GE(metadata[0].size, k64KB - k4KB); // Test compaction trigger and target_file_size_base + // Reduce compaction trigger to 2, and reduce L1 file size to 32KB. + // Writing to 64KB L0 files should trigger a compaction. Since these + // 2 L0 files have the same key range, compaction merge them and should + // result in 2 32KB L1 files. ASSERT_TRUE(dbfull()->SetOptions({ {"level0_file_num_compaction_trigger", "2"}, {"target_file_size_base", std::to_string(k32KB) } @@ -8722,8 +8738,13 @@ TEST(DBTest, DynamicCompactionOptions) { ASSERT_EQ(2U, metadata.size()); ASSERT_LE(metadata[0].size, k32KB + k4KB); ASSERT_GE(metadata[0].size, k32KB - k4KB); + ASSERT_LE(metadata[1].size, k32KB + k4KB); + ASSERT_GE(metadata[1].size, k32KB - k4KB); // Test max_bytes_for_level_base + // Increase level base size to 256KB and write enough data that will + // fill L1 and L2. L1 size should be around 256KB while L2 size should be + // around 256KB x 4. ASSERT_TRUE(dbfull()->SetOptions({ {"max_bytes_for_level_base", std::to_string(k256KB) } })); @@ -8740,7 +8761,9 @@ TEST(DBTest, DynamicCompactionOptions) { SizeAtLevel(2) < 4 * k256KB * 1.2); // Test max_bytes_for_level_multiplier and - // max_bytes_for_level_base (reduce) + // max_bytes_for_level_base. Now, reduce both mulitplier and level base, + // After filling enough data that can fit in L1 - L3, we should see L1 size + // reduces to 128KB from 256KB which was asserted previously. Same for L2. ASSERT_TRUE(dbfull()->SetOptions({ {"max_bytes_for_level_multiplier", "2"}, {"max_bytes_for_level_base", std::to_string(k128KB) } @@ -8752,14 +8775,14 @@ TEST(DBTest, DynamicCompactionOptions) { gen_l0_kb(i, 64, 32); } dbfull()->TEST_WaitForCompact(); - ASSERT_TRUE(SizeAtLevel(1) > k128KB * 0.8 && - SizeAtLevel(1) < k128KB * 1.2); - ASSERT_TRUE(SizeAtLevel(2) > 2 * k128KB * 0.8 && - SizeAtLevel(2) < 2 * k128KB * 1.2); - ASSERT_TRUE(SizeAtLevel(3) > 4 * k128KB * 0.8 && - SizeAtLevel(3) < 4 * k128KB * 1.2); + ASSERT_TRUE(SizeAtLevel(1) < k128KB * 1.2); + ASSERT_TRUE(SizeAtLevel(2) < 2 * k128KB * 1.2); + ASSERT_TRUE(SizeAtLevel(3) < 4 * k128KB * 1.2); - // Clean up memtable and L0 + // Test level0_stop_writes_trigger. + // Clean up memtable and L0. Block compaction threads. If continue to write + // and flush memtables. We should see put timeout after 8 memtable flushes + // since level0_stop_writes_trigger = 8 dbfull()->CompactRange(nullptr, nullptr); // Block compaction SleepingBackgroundTask sleeping_task_low1; @@ -8780,7 +8803,9 @@ TEST(DBTest, DynamicCompactionOptions) { sleeping_task_low1.WakeUp(); sleeping_task_low1.WaitUntilDone(); - // Test: stop trigger (reduce) + // Now reduce level0_stop_writes_trigger to 6. Clear up memtables and L0. + // Block compaction thread again. Perform the put and memtable flushes + // until we see timeout after 6 memtable flushes. ASSERT_TRUE(dbfull()->SetOptions({ {"level0_stop_writes_trigger", "6"} })); @@ -8802,6 +8827,10 @@ TEST(DBTest, DynamicCompactionOptions) { sleeping_task_low2.WaitUntilDone(); // Test disable_auto_compactions + // Compaction thread is unblocked but auto compaction is disabled. Write + // 4 L0 files and compaction should be triggered. If auto compaction is + // disabled, then TEST_WaitForCompact will be waiting for nothing. Number of + // L0 files do not change after the call. ASSERT_TRUE(dbfull()->SetOptions({ {"disable_auto_compactions", "true"} })); @@ -8816,6 +8845,8 @@ TEST(DBTest, DynamicCompactionOptions) { dbfull()->TEST_WaitForCompact(); ASSERT_EQ(NumTableFilesAtLevel(0), 4); + // Enable auto compaction and perform the same test, # of L0 files should be + // reduced after compaction. ASSERT_TRUE(dbfull()->SetOptions({ {"disable_auto_compactions", "false"} })); @@ -8830,8 +8861,10 @@ TEST(DBTest, DynamicCompactionOptions) { dbfull()->TEST_WaitForCompact(); ASSERT_LT(NumTableFilesAtLevel(0), 4); - // Test for hard_rate_limit, change max_bytes_for_level_base to make level - // size big + // Test for hard_rate_limit. + // First change max_bytes_for_level_base to a big value and populate + // L1 - L3. Then thrink max_bytes_for_level_base and disable auto compaction + // at the same time, we should see some level with score greater than 2. ASSERT_TRUE(dbfull()->SetOptions({ {"max_bytes_for_level_base", std::to_string(k256KB) } })); @@ -8861,7 +8894,9 @@ TEST(DBTest, DynamicCompactionOptions) { SizeAtLevel(2) / k64KB > 4 || SizeAtLevel(3) / k64KB > 8); - // Enfoce hard rate limit, L0 score is not regulated by this limit + // Enfoce hard rate limit. Now set hard_rate_limit to 2, + // we should start to see put delay (1000 us) and timeout as a result + // (L0 score is not regulated by this limit). ASSERT_TRUE(dbfull()->SetOptions({ {"hard_rate_limit", "2"} })); @@ -8873,12 +8908,101 @@ TEST(DBTest, DynamicCompactionOptions) { wo.timeout_hint_us = 500; ASSERT_TRUE(Put(Key(count), RandomString(&rnd, 1024), wo).IsTimedOut()); - // Bump up limit + // Lift the limit and no timeout ASSERT_TRUE(dbfull()->SetOptions({ {"hard_rate_limit", "100"} })); dbfull()->TEST_FlushMemTable(true); ASSERT_TRUE(Put(Key(count), RandomString(&rnd, 1024), wo).ok()); + + // Test max_mem_compaction_level. + // Destory DB and start from scratch + options.max_background_compactions = 1; + options.max_background_flushes = 0; + options.max_mem_compaction_level = 2; + DestroyAndReopen(&options); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1), 0); + ASSERT_EQ(NumTableFilesAtLevel(2), 0); + + ASSERT_TRUE(Put("max_mem_compaction_level_key", + RandomString(&rnd, 8)).ok()); + dbfull()->TEST_FlushMemTable(true); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1), 0); + ASSERT_EQ(NumTableFilesAtLevel(2), 1); + + ASSERT_TRUE(Put("max_mem_compaction_level_key", + RandomString(&rnd, 8)).ok()); + // Set new value and it becomes effective in this flush + ASSERT_TRUE(dbfull()->SetOptions({ + {"max_mem_compaction_level", "1"} + })); + dbfull()->TEST_FlushMemTable(true); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1), 1); + ASSERT_EQ(NumTableFilesAtLevel(2), 1); + + ASSERT_TRUE(Put("max_mem_compaction_level_key", + RandomString(&rnd, 8)).ok()); + // Set new value and it becomes effective in this flush + ASSERT_TRUE(dbfull()->SetOptions({ + {"max_mem_compaction_level", "0"} + })); + dbfull()->TEST_FlushMemTable(true); + ASSERT_EQ(NumTableFilesAtLevel(0), 1); + ASSERT_EQ(NumTableFilesAtLevel(1), 1); + ASSERT_EQ(NumTableFilesAtLevel(2), 1); +} + +TEST(DBTest, DynamicMiscOptions) { + // Test max_sequential_skip_in_iterations + Options options; + options.env = env_; + options.create_if_missing = true; + options.max_sequential_skip_in_iterations = 16; + options.compression = kNoCompression; + options.statistics = rocksdb::CreateDBStatistics(); + DestroyAndReopen(&options); + + auto assert_reseek_count = [this, &options](int key_start, int num_reseek) { + int key0 = key_start; + int key1 = key_start + 1; + int key2 = key_start + 2; + Random rnd(301); + ASSERT_OK(Put(Key(key0), RandomString(&rnd, 8))); + for (int i = 0; i < 10; ++i) { + ASSERT_OK(Put(Key(key1), RandomString(&rnd, 8))); + } + ASSERT_OK(Put(Key(key2), RandomString(&rnd, 8))); + std::unique_ptr iter(db_->NewIterator(ReadOptions())); + iter->Seek(Key(key1)); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Key(key1)), 0); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Key(key2)), 0); + ASSERT_EQ(num_reseek, + TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION)); + }; + // No reseek + assert_reseek_count(100, 0); + + ASSERT_TRUE(dbfull()->SetOptions({ + {"max_sequential_skip_in_iterations", "4"} + })); + // Clear memtable and make new option effective + dbfull()->TEST_FlushMemTable(true); + // Trigger reseek + assert_reseek_count(200, 1); + + ASSERT_TRUE(dbfull()->SetOptions({ + {"max_sequential_skip_in_iterations", "16"} + })); + // Clear memtable and make new option effective + dbfull()->TEST_FlushMemTable(true); + // No reseek + assert_reseek_count(300, 1); } } // namespace rocksdb diff --git a/db/deletefile_test.cc b/db/deletefile_test.cc index f1cd4b040..6a6f8e953 100644 --- a/db/deletefile_test.cc +++ b/db/deletefile_test.cc @@ -287,6 +287,75 @@ TEST(DeleteFileTest, DeleteLogFiles) { CloseDB(); } +TEST(DeleteFileTest, DeleteNonDefaultColumnFamily) { + CloseDB(); + DBOptions db_options; + db_options.create_if_missing = true; + db_options.create_missing_column_families = true; + std::vector column_families; + column_families.emplace_back(); + column_families.emplace_back("new_cf", ColumnFamilyOptions()); + + std::vector handles; + rocksdb::DB* db; + ASSERT_OK(DB::Open(db_options, dbname_, column_families, &handles, &db)); + + Random rnd(5); + for (int i = 0; i < 1000; ++i) { + ASSERT_OK(db->Put(WriteOptions(), handles[1], test::RandomKey(&rnd, 10), + test::RandomKey(&rnd, 10))); + } + ASSERT_OK(db->Flush(FlushOptions(), handles[1])); + for (int i = 0; i < 1000; ++i) { + ASSERT_OK(db->Put(WriteOptions(), handles[1], test::RandomKey(&rnd, 10), + test::RandomKey(&rnd, 10))); + } + ASSERT_OK(db->Flush(FlushOptions(), handles[1])); + + std::vector metadata; + db->GetLiveFilesMetaData(&metadata); + ASSERT_EQ(2U, metadata.size()); + ASSERT_EQ("new_cf", metadata[0].column_family_name); + ASSERT_EQ("new_cf", metadata[1].column_family_name); + auto old_file = metadata[0].smallest_seqno < metadata[1].smallest_seqno + ? metadata[0].name + : metadata[1].name; + auto new_file = metadata[0].smallest_seqno > metadata[1].smallest_seqno + ? metadata[0].name + : metadata[1].name; + ASSERT_TRUE(db->DeleteFile(new_file).IsInvalidArgument()); + ASSERT_OK(db->DeleteFile(old_file)); + + { + std::unique_ptr itr(db->NewIterator(ReadOptions(), handles[1])); + int count = 0; + for (itr->SeekToFirst(); itr->Valid(); itr->Next()) { + ASSERT_OK(itr->status()); + ++count; + } + ASSERT_EQ(count, 1000); + } + + delete handles[0]; + delete handles[1]; + delete db; + + ASSERT_OK(DB::Open(db_options, dbname_, column_families, &handles, &db)); + { + std::unique_ptr itr(db->NewIterator(ReadOptions(), handles[1])); + int count = 0; + for (itr->SeekToFirst(); itr->Valid(); itr->Next()) { + ASSERT_OK(itr->status()); + ++count; + } + ASSERT_EQ(count, 1000); + } + + delete handles[0]; + delete handles[1]; + delete db; +} + } //namespace rocksdb int main(int argc, char** argv) { diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index 684045e05..b2e4bd067 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -114,25 +114,29 @@ class LevelIterator : public Iterator { }; ForwardIterator::ForwardIterator(DBImpl* db, const ReadOptions& read_options, - ColumnFamilyData* cfd) + ColumnFamilyData* cfd, SuperVersion* current_sv) : db_(db), read_options_(read_options), cfd_(cfd), prefix_extractor_(cfd->options()->prefix_extractor.get()), user_comparator_(cfd->user_comparator()), immutable_min_heap_(MinIterComparator(&cfd_->internal_comparator())), - sv_(nullptr), + sv_(current_sv), mutable_iter_(nullptr), current_(nullptr), valid_(false), is_prev_set_(false), - is_prev_inclusive_(false) {} - -ForwardIterator::~ForwardIterator() { - Cleanup(); + is_prev_inclusive_(false) { + if (sv_) { + RebuildIterators(false); + } } -void ForwardIterator::Cleanup() { +ForwardIterator::~ForwardIterator() { + Cleanup(true); +} + +void ForwardIterator::Cleanup(bool release_sv) { if (mutable_iter_ != nullptr) { mutable_iter_->~Iterator(); } @@ -149,15 +153,17 @@ void ForwardIterator::Cleanup() { } level_iters_.clear(); - if (sv_ != nullptr && sv_->Unref()) { - DBImpl::DeletionState deletion_state; - db_->mutex_.Lock(); - sv_->Cleanup(); - db_->FindObsoleteFiles(deletion_state, false, true); - db_->mutex_.Unlock(); - delete sv_; - if (deletion_state.HaveSomethingToDelete()) { - db_->PurgeObsoleteFiles(deletion_state); + if (release_sv) { + if (sv_ != nullptr && sv_->Unref()) { + DBImpl::DeletionState deletion_state; + db_->mutex_.Lock(); + sv_->Cleanup(); + db_->FindObsoleteFiles(deletion_state, false, true); + db_->mutex_.Unlock(); + delete sv_; + if (deletion_state.HaveSomethingToDelete()) { + db_->PurgeObsoleteFiles(deletion_state); + } } } } @@ -169,7 +175,7 @@ bool ForwardIterator::Valid() const { void ForwardIterator::SeekToFirst() { if (sv_ == nullptr || sv_ ->version_number != cfd_->GetSuperVersionNumber()) { - RebuildIterators(); + RebuildIterators(true); } else if (status_.IsIncomplete()) { ResetIncompleteIterators(); } @@ -179,7 +185,7 @@ void ForwardIterator::SeekToFirst() { void ForwardIterator::Seek(const Slice& internal_key) { if (sv_ == nullptr || sv_ ->version_number != cfd_->GetSuperVersionNumber()) { - RebuildIterators(); + RebuildIterators(true); } else if (status_.IsIncomplete()) { ResetIncompleteIterators(); } @@ -188,6 +194,7 @@ void ForwardIterator::Seek(const Slice& internal_key) { void ForwardIterator::SeekInternal(const Slice& internal_key, bool seek_to_first) { + assert(mutable_iter_); // mutable seek_to_first ? mutable_iter_->SeekToFirst() : mutable_iter_->Seek(internal_key); @@ -338,7 +345,7 @@ void ForwardIterator::Next() { std::string current_key = key().ToString(); Slice old_key(current_key.data(), current_key.size()); - RebuildIterators(); + RebuildIterators(true); SeekInternal(old_key, false); if (!valid_ || key().compare(old_key) != 0) { return; @@ -412,11 +419,13 @@ Status ForwardIterator::status() const { return Status::OK(); } -void ForwardIterator::RebuildIterators() { +void ForwardIterator::RebuildIterators(bool refresh_sv) { // Clean up - Cleanup(); - // New - sv_ = cfd_->GetReferencedSuperVersion(&(db_->mutex_)); + Cleanup(refresh_sv); + if (refresh_sv) { + // New + sv_ = cfd_->GetReferencedSuperVersion(&(db_->mutex_)); + } mutable_iter_ = sv_->mem->NewIterator(read_options_, &arena_); sv_->imm->AddIterators(read_options_, &imm_iters_, &arena_); const auto& l0_files = sv_->current->files_[0]; diff --git a/db/forward_iterator.h b/db/forward_iterator.h index 4d3761ee1..537dc1352 100644 --- a/db/forward_iterator.h +++ b/db/forward_iterator.h @@ -51,7 +51,7 @@ typedef std::priority_queueappend(buf); // Cumulative uint64_t user_bytes_written = db_stats_[InternalStats::BYTES_WRITTEN]; + uint64_t num_keys_written = db_stats_[InternalStats::NUMBER_KEYS_WRITTEN]; uint64_t write_other = db_stats_[InternalStats::WRITE_DONE_BY_OTHER]; uint64_t write_self = db_stats_[InternalStats::WRITE_DONE_BY_SELF]; uint64_t wal_bytes = db_stats_[InternalStats::WAL_FILE_BYTES]; uint64_t wal_synced = db_stats_[InternalStats::WAL_FILE_SYNCED]; uint64_t write_with_wal = db_stats_[InternalStats::WRITE_WITH_WAL]; // Data + // writes: total number of write requests. + // keys: total number of key updates issued by all the write requests + // batches: number of group commits issued to the DB. Each group can contain + // one or more writes. + // so writes/keys is the average number of put in multi-put or put + // writes/batches is the average group commit size. + // + // The format is the same for interval stats. snprintf(buf, sizeof(buf), - "Cumulative writes: %" PRIu64 " writes, %" PRIu64 " batches, " - "%.1f writes per batch, %.2f GB user ingest\n", - write_other + write_self, write_self, + "Cumulative writes: %" PRIu64 " writes, %" PRIu64 " keys, %" PRIu64 + " batches, %.1f writes per batch, %.2f GB user ingest\n", + write_other + write_self, num_keys_written, write_self, (write_other + write_self) / static_cast(write_self + 1), user_bytes_written / kGB); value->append(buf); @@ -318,11 +327,13 @@ void InternalStats::DumpDBStats(std::string* value) { // Interval uint64_t interval_write_other = write_other - db_stats_snapshot_.write_other; uint64_t interval_write_self = write_self - db_stats_snapshot_.write_self; + uint64_t interval_num_keys_written = + num_keys_written - db_stats_snapshot_.num_keys_written; snprintf(buf, sizeof(buf), - "Interval writes: %" PRIu64 " writes, %" PRIu64 " batches, " - "%.1f writes per batch, %.1f MB user ingest\n", + "Interval writes: %" PRIu64 " writes, %" PRIu64 " keys, %" PRIu64 + " batches, %.1f writes per batch, %.1f MB user ingest\n", interval_write_other + interval_write_self, - interval_write_self, + interval_num_keys_written, interval_write_self, static_cast(interval_write_other + interval_write_self) / (interval_write_self + 1), (user_bytes_written - db_stats_snapshot_.ingest_bytes) / kMB); @@ -347,6 +358,7 @@ void InternalStats::DumpDBStats(std::string* value) { db_stats_snapshot_.ingest_bytes = user_bytes_written; db_stats_snapshot_.write_other = write_other; db_stats_snapshot_.write_self = write_self; + db_stats_snapshot_.num_keys_written = num_keys_written; db_stats_snapshot_.wal_bytes = wal_bytes; db_stats_snapshot_.wal_synced = wal_synced; db_stats_snapshot_.write_with_wal = write_with_wal; diff --git a/db/internal_stats.h b/db/internal_stats.h index 4d12a2512..5caa33415 100644 --- a/db/internal_stats.h +++ b/db/internal_stats.h @@ -67,6 +67,7 @@ class InternalStats { WAL_FILE_BYTES, WAL_FILE_SYNCED, BYTES_WRITTEN, + NUMBER_KEYS_WRITTEN, WRITE_DONE_BY_OTHER, WRITE_DONE_BY_SELF, WRITE_WITH_WAL, @@ -264,6 +265,11 @@ class InternalStats { // another thread. uint64_t write_other; uint64_t write_self; + // Total number of keys written. write_self and write_other measure number + // of write requests written, Each of the write request can contain updates + // to multiple keys. num_keys_written is total number of keys updated by all + // those writes. + uint64_t num_keys_written; double seconds_up; DBStatsSnapshot() @@ -273,6 +279,7 @@ class InternalStats { write_with_wal(0), write_other(0), write_self(0), + num_keys_written(0), seconds_up(0) {} } db_stats_snapshot_; diff --git a/db/version_edit.cc b/db/version_edit.cc index 4e2cf8f5b..271016aaf 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -293,7 +293,7 @@ Status VersionEdit::DecodeFrom(const Slice& src) { new_files_.push_back(std::make_pair(level, f)); } else { if (!msg) { - msg = "new-file2 entry"; + msg = "new-file3 entry"; } } break; diff --git a/db/version_set.cc b/db/version_set.cc index 0819196fb..65c36c715 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -895,7 +895,7 @@ void Version::ComputeCompactionScore( } if (cfd_->ioptions()->compaction_style == kCompactionStyleFIFO) { score = static_cast(total_size) / - cfd_->options()->compaction_options_fifo.max_table_files_size; + cfd_->ioptions()->compaction_options_fifo.max_table_files_size; } else if (numfiles >= mutable_cf_options.level0_stop_writes_trigger) { // If we are slowing down writes, then we better compact that first score = 1000000; @@ -1051,8 +1051,8 @@ int Version::PickLevelForMemTableOutput( InternalKey start(smallest_user_key, kMaxSequenceNumber, kValueTypeForSeek); InternalKey limit(largest_user_key, 0, static_cast(0)); std::vector overlaps; - int max_mem_compact_level = cfd_->options()->max_mem_compaction_level; - while (max_mem_compact_level > 0 && level < max_mem_compact_level) { + while (mutable_cf_options.max_mem_compaction_level > 0 && + level < mutable_cf_options.max_mem_compaction_level) { if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) { break; } diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index 4c06c23f7..4fddab4b3 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -355,11 +355,13 @@ class TableFactory { WritableFile* file, const CompressionType compression_type, const CompressionOptions& compression_opts) const = 0; - // Sanitizes the specified DB Options. + // Sanitizes the specified DB Options and ColumnFamilyOptions. // // If the function cannot find a way to sanitize the input DB Options, // a non-ok Status will be returned. - virtual Status SanitizeDBOptions(const DBOptions* db_opts) const = 0; + virtual Status SanitizeOptions( + const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const = 0; // Return a string that contains printable format of table configurations. // RocksDB prints configurations at DB Open(). diff --git a/java/Makefile b/java/Makefile index f06cea579..c8bc50290 100644 --- a/java/Makefile +++ b/java/Makefile @@ -1,4 +1,4 @@ -NATIVE_JAVA_CLASSES = org.rocksdb.RocksDB org.rocksdb.Options org.rocksdb.WriteBatch org.rocksdb.WriteBatchInternal org.rocksdb.WriteBatchTest org.rocksdb.WriteOptions org.rocksdb.BackupableDB org.rocksdb.BackupableDBOptions org.rocksdb.Statistics org.rocksdb.RocksIterator org.rocksdb.VectorMemTableConfig org.rocksdb.SkipListMemTableConfig org.rocksdb.HashLinkedListMemTableConfig org.rocksdb.HashSkipListMemTableConfig org.rocksdb.PlainTableConfig org.rocksdb.BlockBasedTableConfig org.rocksdb.ReadOptions org.rocksdb.Filter org.rocksdb.BloomFilter org.rocksdb.RestoreOptions org.rocksdb.RestoreBackupableDB org.rocksdb.RocksEnv org.rocksdb.GenericRateLimiterConfig org.rocksdb.ColumnFamilyHandle org.rocksdb.MergeOperator org.rocksdb.StringAppendOperator +NATIVE_JAVA_CLASSES = org.rocksdb.RocksDB org.rocksdb.Options org.rocksdb.WriteBatch org.rocksdb.WriteBatchInternal org.rocksdb.WriteBatchTest org.rocksdb.WriteOptions org.rocksdb.BackupableDB org.rocksdb.BackupableDBOptions org.rocksdb.Statistics org.rocksdb.RocksIterator org.rocksdb.VectorMemTableConfig org.rocksdb.SkipListMemTableConfig org.rocksdb.HashLinkedListMemTableConfig org.rocksdb.HashSkipListMemTableConfig org.rocksdb.PlainTableConfig org.rocksdb.BlockBasedTableConfig org.rocksdb.ReadOptions org.rocksdb.Filter org.rocksdb.BloomFilter org.rocksdb.ComparatorOptions org.rocksdb.AbstractComparator org.rocksdb.Comparator org.rocksdb.DirectComparator org.rocksdb.AbstractSlice org.rocksdb.Slice org.rocksdb.DirectSlice org.rocksdb.RestoreOptions org.rocksdb.RestoreBackupableDB org.rocksdb.RocksEnv org.rocksdb.GenericRateLimiterConfig org.rocksdb.ColumnFamilyHandle org.rocksdb.MergeOperator org.rocksdb.StringAppendOperator ROCKSDB_MAJOR = $(shell egrep "ROCKSDB_MAJOR.[0-9]" ../include/rocksdb/version.h | cut -d ' ' -f 3) ROCKSDB_MINOR = $(shell egrep "ROCKSDB_MINOR.[0-9]" ../include/rocksdb/version.h | cut -d ' ' -f 3) @@ -42,11 +42,15 @@ test: java java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ColumnFamilyTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.FilterTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.KeyMayExistTest + java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.MemTableTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.OptionsTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ReadOnlyTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.MergeTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ReadOptionsTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.StatisticsCollectorTest + java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ComparatorOptionsTest + java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ComparatorTest + java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.DirectComparatorTest @rm -rf /tmp/rocksdbjni_* db_bench: java diff --git a/java/crossbuild/build-linux-centos.sh b/java/crossbuild/build-linux-centos.sh index 55f179b62..5730b1533 100755 --- a/java/crossbuild/build-linux-centos.sh +++ b/java/crossbuild/build-linux-centos.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash # install all required packages for rocksdb that are available through yum ARCH=$(uname -i) -sudo yum -y install java-1.6.0-openjdk-devel.$ARCH zlib zlib-devel bzip2 bzip2-devel +sudo yum -y install java-1.7.0-openjdk-devel.$ARCH zlib zlib-devel bzip2 bzip2-devel # install gcc/g++ 4.7 via CERN (http://linux.web.cern.ch/linux/devtoolset/) sudo wget -O /etc/yum.repos.d/slc5-devtoolset.repo http://linuxsoft.cern.ch/cern/devtoolset/slc5-devtoolset.repo @@ -12,7 +12,7 @@ tar xvfz gflags-1.6.tar.gz; cd gflags-1.6; scl enable devtoolset-1.1 ./configure export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib # set java home so we can build rocksdb jars -export JAVA_HOME=/usr/lib/jvm/java-1.6.0 +export JAVA_HOME=/usr/lib/jvm/java-1.7.0 # build rocksdb cd /rocksdb diff --git a/java/org/rocksdb/AbstractComparator.java b/java/org/rocksdb/AbstractComparator.java new file mode 100644 index 000000000..8de50e271 --- /dev/null +++ b/java/org/rocksdb/AbstractComparator.java @@ -0,0 +1,95 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * Comparators are used by RocksDB to determine + * the ordering of keys. + * + * This class is package private, implementers + * should extend either of the public abstract classes: + * @see org.rocksdb.Comparator + * @see org.rocksdb.DirectComparator + */ +public abstract class AbstractComparator + extends RocksObject { + + /** + * The name of the comparator. Used to check for comparator + * mismatches (i.e., a DB created with one comparator is + * accessed using a different comparator). + * + * A new name should be used whenever + * the comparator implementation changes in a way that will cause + * the relative ordering of any two keys to change. + * + * Names starting with "rocksdb." are reserved and should not be used. + * + * @return The name of this comparator implementation + */ + public abstract String name(); + + /** + * Three-way key comparison + * + * @param a Slice access to first key + * @param b Slice access to second key + * + * @return Should return either: + * 1) < 0 if "a" < "b" + * 2) == 0 if "a" == "b" + * 3) > 0 if "a" > "b" + */ + public abstract int compare(final T a, final T b); + + /** + * Used to reduce the space requirements + * for internal data structures like index blocks. + * + * If start < limit, you may return a new start which is a + * shorter string in [start, limit). + * + * Simple comparator implementations may return null if they + * wish to use start unchanged. i.e., an implementation of + * this method that does nothing is correct. + * + * @return a shorter start, or null + */ + public String findShortestSeparator(final String start, final T limit) { + return null; + } + + /** + * Used to reduce the space requirements + * for internal data structures like index blocks. + * + * You may return a new short key (key1) where + * key1 >= key. + * + * Simple comparator implementations may return null if they + * wish to leave the key unchanged. i.e., an implementation of + * this method that does nothing is correct. + * + * @return a shorter key, or null + */ + public String findShortSuccessor(final String key) { + return null; + } + + /** + * Deletes underlying C++ comparator pointer. + * + * Note that this function should be called only after all + * RocksDB instances referencing the comparator are closed. + * Otherwise an undefined behavior will occur. + */ + @Override protected void disposeInternal() { + assert(isInitialized()); + disposeInternal(nativeHandle_); + } + + private native void disposeInternal(long handle); +} diff --git a/java/org/rocksdb/AbstractSlice.java b/java/org/rocksdb/AbstractSlice.java new file mode 100644 index 000000000..971bd7c1a --- /dev/null +++ b/java/org/rocksdb/AbstractSlice.java @@ -0,0 +1,167 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * Slices are used by RocksDB to provide + * efficient access to keys and values. + * + * This class is package private, implementers + * should extend either of the public abstract classes: + * @see org.rocksdb.Slice + * @see org.rocksdb.DirectSlice + * + * Regards the lifecycle of Java Slices in RocksDB: + * At present when you configure a Comparator from Java, it creates an + * instance of a C++ BaseComparatorJniCallback subclass and + * passes that to RocksDB as the comparator. That subclass of + * BaseComparatorJniCallback creates the Java + * {@see org.rocksdb.AbstractSlice} subclass Objects. When you dispose + * the Java {@see org.rocksdb.AbstractComparator} subclass, it disposes the + * C++ BaseComparatorJniCallback subclass, which in turn destroys the + * Java {@see org.rocksdb.AbstractSlice} subclass Objects. + */ +abstract class AbstractSlice extends RocksObject { + + /** + * Returns the data of the slice. + * + * @return The slice data. Note, the type of access is + * determined by the subclass + * @see org.rocksdb.AbstractSlice#data0(long). + */ + public T data() { + assert (isInitialized()); + return data0(nativeHandle_); + } + + /** + * Access to the data is provided by the + * subtype as it needs to handle the + * generic typing. + * + * @param handle The address of the underlying + * native object. + * + * @return Java typed access to the data. + */ + protected abstract T data0(long handle); + + /** + * Return the length (in bytes) of the data. + * + * @return The length in bytes. + */ + public int size() { + assert (isInitialized()); + return size0(nativeHandle_); + } + + /** + * Return true if the length of the + * data is zero. + * + * @return true if there is no data, false otherwise. + */ + public boolean empty() { + assert (isInitialized()); + return empty0(nativeHandle_); + } + + /** + * Creates a string representation of the data + * + * @param hex When true, the representation + * will be encoded in hexadecimal. + * + * @return The string representation of the data. + */ + public String toString(final boolean hex) { + assert (isInitialized()); + return toString0(nativeHandle_, hex); + } + + @Override + public String toString() { + return toString(false); + } + + /** + * Three-way key comparison + * + * @param other A slice to compare against + * + * @return Should return either: + * 1) < 0 if this < other + * 2) == 0 if this == other + * 3) > 0 if this > other + */ + public int compare(final AbstractSlice other) { + assert (other != null); + assert (isInitialized()); + return compare0(nativeHandle_, other.nativeHandle_); + } + + /** + * If other is a slice object, then + * we defer to {@link #compare(AbstractSlice) compare} + * to check equality, otherwise we return false. + * + * @param other Object to test for equality + * + * @return true when {@code this.compare(other) == 0}, + * false otherwise. + */ + @Override + public boolean equals(final Object other) { + if (other != null && other instanceof AbstractSlice) { + return compare((AbstractSlice)other) == 0; + } else { + return false; + } + } + + /** + * Determines whether this slice starts with + * another slice + * + * @param prefix Another slice which may of may not + * be a prefix of this slice. + * + * @return true when this slice starts with the + * {@code prefix} slice + */ + public boolean startsWith(final AbstractSlice prefix) { + if (prefix != null) { + assert (isInitialized()); + return startsWith0(nativeHandle_, prefix.nativeHandle_); + } else { + return false; + } + } + + /** + * Deletes underlying C++ slice pointer. + *

+ * Note that this function should be called only after all + * RocksDB instances referencing the slice are closed. + * Otherwise an undefined behavior will occur. + */ + @Override + protected void disposeInternal() { + assert(isInitialized()); + disposeInternal(nativeHandle_); + } + + protected native void createNewSliceFromString(String str); + private native int size0(long handle); + private native boolean empty0(long handle); + private native String toString0(long handle, boolean hex); + private native int compare0(long handle, long otherHandle); + private native boolean startsWith0(long handle, long otherHandle); + private native void disposeInternal(long handle); + +} diff --git a/java/org/rocksdb/Comparator.java b/java/org/rocksdb/Comparator.java new file mode 100644 index 000000000..c8e050bca --- /dev/null +++ b/java/org/rocksdb/Comparator.java @@ -0,0 +1,24 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * Base class for comparators which will receive + * byte[] based access via org.rocksdb.Slice in their + * compare method implementation. + * + * byte[] based slices perform better when small keys + * are involved. When using larger keys consider + * using @see org.rocksdb.DirectComparator + */ +public abstract class Comparator extends AbstractComparator { + public Comparator(final ComparatorOptions copt) { + super(); + createNewComparator0(copt.nativeHandle_); + } + + private native void createNewComparator0(final long comparatorOptionsHandle); +} diff --git a/java/org/rocksdb/ComparatorOptions.java b/java/org/rocksdb/ComparatorOptions.java new file mode 100644 index 000000000..f0ba520a3 --- /dev/null +++ b/java/org/rocksdb/ComparatorOptions.java @@ -0,0 +1,57 @@ +package org.rocksdb; + +/** + * This class controls the behaviour + * of Java implementations of + * AbstractComparator + * + * Note that dispose() must be called before a ComparatorOptions + * instance becomes out-of-scope to release the allocated memory in C++. + */ +public class ComparatorOptions extends RocksObject { + public ComparatorOptions() { + super(); + newComparatorOptions(); + } + + /** + * Use adaptive mutex, which spins in the user space before resorting + * to kernel. This could reduce context switch when the mutex is not + * heavily contended. However, if the mutex is hot, we could end up + * wasting spin time. + * Default: false + * + * @return true if adaptive mutex is used. + */ + public boolean useAdaptiveMutex() { + assert(isInitialized()); + return useAdaptiveMutex(nativeHandle_); + } + + /** + * Use adaptive mutex, which spins in the user space before resorting + * to kernel. This could reduce context switch when the mutex is not + * heavily contended. However, if the mutex is hot, we could end up + * wasting spin time. + * Default: false + * + * @param useAdaptiveMutex true if adaptive mutex is used. + * @return the reference to the current comparator options. + */ + public ComparatorOptions setUseAdaptiveMutex(final boolean useAdaptiveMutex) { + assert (isInitialized()); + setUseAdaptiveMutex(nativeHandle_, useAdaptiveMutex); + return this; + } + + @Override protected void disposeInternal() { + assert(isInitialized()); + disposeInternal(nativeHandle_); + } + + private native void newComparatorOptions(); + private native boolean useAdaptiveMutex(final long handle); + private native void setUseAdaptiveMutex(final long handle, + final boolean useAdaptiveMutex); + private native void disposeInternal(long handle); +} diff --git a/java/org/rocksdb/DirectComparator.java b/java/org/rocksdb/DirectComparator.java new file mode 100644 index 000000000..47f4d7256 --- /dev/null +++ b/java/org/rocksdb/DirectComparator.java @@ -0,0 +1,24 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * Base class for comparators which will receive + * ByteBuffer based access via org.rocksdb.DirectSlice + * in their compare method implementation. + * + * ByteBuffer based slices perform better when large keys + * are involved. When using smaller keys consider + * using @see org.rocksdb.Comparator + */ +public abstract class DirectComparator extends AbstractComparator { + public DirectComparator(final ComparatorOptions copt) { + super(); + createNewDirectComparator0(copt.nativeHandle_); + } + + private native void createNewDirectComparator0(final long comparatorOptionsHandle); +} diff --git a/java/org/rocksdb/DirectSlice.java b/java/org/rocksdb/DirectSlice.java new file mode 100644 index 000000000..847bbd9c1 --- /dev/null +++ b/java/org/rocksdb/DirectSlice.java @@ -0,0 +1,113 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +import java.nio.ByteBuffer; + +/** + * Base class for slices which will receive direct + * ByteBuffer based access to the underlying data. + * + * ByteBuffer backed slices typically perform better with + * larger keys and values. When using smaller keys and + * values consider using @see org.rocksdb.Slice + */ +public class DirectSlice extends AbstractSlice { + /** + * Called from JNI to construct a new Java DirectSlice + * without an underlying C++ object set + * at creation time. + * + * Note: You should be aware that + * {@see org.rocksdb.RocksObject#disOwnNativeHandle()} is intentionally + * called from the default DirectSlice constructor, and that it is marked as + * private. This is so that developers cannot construct their own default + * DirectSlice objects (at present). As developers cannot construct their own + * DirectSlice objects through this, they are not creating underlying C++ + * DirectSlice objects, and so there is nothing to free (dispose) from Java. + */ + private DirectSlice() { + super(); + disOwnNativeHandle(); + } + + /** + * Constructs a slice + * where the data is taken from + * a String. + * + * @param str The string + */ + public DirectSlice(final String str) { + super(); + createNewSliceFromString(str); + } + + /** + * Constructs a slice where the data is + * read from the provided + * ByteBuffer up to a certain length + * + * @param data The buffer containing the data + * @param length The length of the data to use for the slice + */ + public DirectSlice(final ByteBuffer data, final int length) { + super(); + createNewDirectSlice0(data, length); + } + + /** + * Constructs a slice where the data is + * read from the provided + * ByteBuffer + * + * @param data The bugger containing the data + */ + public DirectSlice(final ByteBuffer data) { + super(); + createNewDirectSlice1(data); + } + + /** + * Retrieves the byte at a specific offset + * from the underlying data + * + * @param offset The (zero-based) offset of the byte to retrieve + * + * @return the requested byte + */ + public byte get(int offset) { + assert (isInitialized()); + return get0(nativeHandle_, offset); + } + + /** + * Clears the backing slice + */ + public void clear() { + assert (isInitialized()); + clear0(nativeHandle_); + } + + /** + * Drops the specified {@code n} + * number of bytes from the start + * of the backing slice + * + * @param n The number of bytes to drop + */ + public void removePrefix(final int n) { + assert (isInitialized()); + removePrefix0(nativeHandle_, n); + } + + private native void createNewDirectSlice0(ByteBuffer data, int length); + private native void createNewDirectSlice1(ByteBuffer data); + @Override protected final native ByteBuffer data0(long handle); + private native byte get0(long handle, int offset); + private native void clear0(long handle); + private native void removePrefix0(long handle, int length); +} diff --git a/java/org/rocksdb/HashLinkedListMemTableConfig.java b/java/org/rocksdb/HashLinkedListMemTableConfig.java index 381a16f49..78a4e8661 100644 --- a/java/org/rocksdb/HashLinkedListMemTableConfig.java +++ b/java/org/rocksdb/HashLinkedListMemTableConfig.java @@ -15,9 +15,21 @@ package org.rocksdb; */ public class HashLinkedListMemTableConfig extends MemTableConfig { public static final long DEFAULT_BUCKET_COUNT = 50000; + public static final long DEFAULT_HUGE_PAGE_TLB_SIZE = 0; + public static final int DEFAULT_BUCKET_ENTRIES_LOG_THRES = 4096; + public static final boolean + DEFAULT_IF_LOG_BUCKET_DIST_WHEN_FLUSH = true; + public static final int DEFAUL_THRESHOLD_USE_SKIPLIST = 256; + /** + * HashLinkedListMemTableConfig constructor + */ public HashLinkedListMemTableConfig() { bucketCount_ = DEFAULT_BUCKET_COUNT; + hugePageTlbSize_ = DEFAULT_HUGE_PAGE_TLB_SIZE; + bucketEntriesLoggingThreshold_ = DEFAULT_BUCKET_ENTRIES_LOG_THRES; + ifLogBucketDistWhenFlush_ = DEFAULT_IF_LOG_BUCKET_DIST_WHEN_FLUSH; + thresholdUseSkiplist_ = DEFAUL_THRESHOLD_USE_SKIPLIST; } /** @@ -42,13 +54,119 @@ public class HashLinkedListMemTableConfig extends MemTableConfig { return bucketCount_; } - @Override protected long newMemTableFactoryHandle() - throws RocksDBException { - return newMemTableFactoryHandle(bucketCount_); + /** + *

Set the size of huge tlb or allocate the hashtable bytes from + * malloc if {@code size <= 0}.

+ * + *

The user needs to reserve huge pages for it to be allocated, + * like: {@code sysctl -w vm.nr_hugepages=20}

+ * + *

See linux documentation/vm/hugetlbpage.txt

+ * + * @param size if set to {@code <= 0} hashtable bytes from malloc + * @return the reference to the current HashLinkedListMemTableConfig. + */ + public HashLinkedListMemTableConfig setHugePageTlbSize(long size) { + hugePageTlbSize_ = size; + return this; } - private native long newMemTableFactoryHandle(long bucketCount) + /** + * Returns the size value of hugePageTlbSize. + * + * @return the hugePageTlbSize. + */ + public long hugePageTlbSize() { + return hugePageTlbSize_; + } + + /** + * If number of entries in one bucket exceeds that setting, log + * about it. + * + * @param threshold - number of entries in a single bucket before + * logging starts. + * @return the reference to the current HashLinkedListMemTableConfig. + */ + public HashLinkedListMemTableConfig + setBucketEntriesLoggingThreshold(int threshold) { + bucketEntriesLoggingThreshold_ = threshold; + return this; + } + + /** + * Returns the maximum number of entries in one bucket before + * logging starts. + * + * @return maximum number of entries in one bucket before logging + * starts. + */ + public int bucketEntriesLoggingThreshold() { + return bucketEntriesLoggingThreshold_; + } + + /** + * If true the distrubition of number of entries will be logged. + * + * @param logDistribution - boolean parameter indicating if number + * of entry distribution shall be logged. + * @return the reference to the current HashLinkedListMemTableConfig. + */ + public HashLinkedListMemTableConfig + setIfLogBucketDistWhenFlush(boolean logDistribution) { + ifLogBucketDistWhenFlush_ = logDistribution; + return this; + } + + /** + * Returns information about logging the distribution of + * number of entries on flush. + * + * @return if distrubtion of number of entries shall be logged. + */ + public boolean ifLogBucketDistWhenFlush() { + return ifLogBucketDistWhenFlush_; + } + + /** + * Set maximum number of entries in one bucket. Exceeding this val + * leads to a switch from LinkedList to SkipList. + * + * @param threshold maximum number of entries before SkipList is + * used. + * @return the reference to the current HashLinkedListMemTableConfig. + */ + public HashLinkedListMemTableConfig + setThresholdUseSkiplist(int threshold) { + thresholdUseSkiplist_ = threshold; + return this; + } + + /** + * Returns entries per bucket threshold before LinkedList is + * replaced by SkipList usage for that bucket. + * + * @return entries per bucket threshold before SkipList is used. + */ + public int thresholdUseSkiplist() { + return thresholdUseSkiplist_; + } + + @Override protected long newMemTableFactoryHandle() + throws RocksDBException { + return newMemTableFactoryHandle(bucketCount_, hugePageTlbSize_, + bucketEntriesLoggingThreshold_, ifLogBucketDistWhenFlush_, + thresholdUseSkiplist_); + } + + private native long newMemTableFactoryHandle(long bucketCount, + long hugePageTlbSize, int bucketEntriesLoggingThreshold, + boolean ifLogBucketDistWhenFlush, int thresholdUseSkiplist) throws RocksDBException; private long bucketCount_; + private long hugePageTlbSize_; + private int bucketEntriesLoggingThreshold_; + private boolean ifLogBucketDistWhenFlush_; + private int thresholdUseSkiplist_; } diff --git a/java/org/rocksdb/HashSkipListMemTableConfig.java b/java/org/rocksdb/HashSkipListMemTableConfig.java index 100f16c82..ad2120f18 100644 --- a/java/org/rocksdb/HashSkipListMemTableConfig.java +++ b/java/org/rocksdb/HashSkipListMemTableConfig.java @@ -18,6 +18,9 @@ public class HashSkipListMemTableConfig extends MemTableConfig { public static final int DEFAULT_BRANCHING_FACTOR = 4; public static final int DEFAULT_HEIGHT = 4; + /** + * HashSkipListMemTableConfig constructor + */ public HashSkipListMemTableConfig() { bucketCount_ = DEFAULT_BUCKET_COUNT; branchingFactor_ = DEFAULT_BRANCHING_FACTOR; diff --git a/java/org/rocksdb/Options.java b/java/org/rocksdb/Options.java index e5eb96374..a3ea38f7c 100644 --- a/java/org/rocksdb/Options.java +++ b/java/org/rocksdb/Options.java @@ -193,6 +193,27 @@ public class Options extends RocksObject { return maxWriteBufferNumber(nativeHandle_); } + /** + * Use the specified comparator for key ordering. + * + * Comparator should not be disposed before options instances using this comparator is + * disposed. If dispose() function is not called, then comparator object will be + * GC'd automatically. + * + * Comparator instance can be re-used in multiple options instances. + * + * @param comparator java instance. + * @return the instance of the current Options. + * @see RocksDB.open() + */ + public Options setComparator(AbstractComparator comparator) { + assert (isInitialized()); + setComparatorHandle(nativeHandle_, comparator.nativeHandle_); + comparator_ = comparator; + return this; + } + private native void setComparatorHandle(long optHandle, long comparatorHandle); + /** * If true, an error will be thrown during RocksDB.open() if the * database already exists. @@ -2318,6 +2339,7 @@ public class Options extends RocksObject { long cacheSize_; int numShardBits_; + AbstractComparator comparator_; RocksEnv env_; MemTableConfig memTableConfig_; TableFormatConfig tableFormatConfig_; diff --git a/java/org/rocksdb/SkipListMemTableConfig.java b/java/org/rocksdb/SkipListMemTableConfig.java index 7f9f5cb5f..d26fd9d32 100644 --- a/java/org/rocksdb/SkipListMemTableConfig.java +++ b/java/org/rocksdb/SkipListMemTableConfig.java @@ -4,12 +4,46 @@ package org.rocksdb; * The config for skip-list memtable representation. */ public class SkipListMemTableConfig extends MemTableConfig { + + public static final long DEFAULT_LOOKAHEAD = 0; + + /** + * SkipListMemTableConfig constructor + */ public SkipListMemTableConfig() { + lookahead_ = DEFAULT_LOOKAHEAD; } + /** + * Sets lookahead for SkipList + * + * @param lookahead If non-zero, each iterator's seek operation + * will start the search from the previously visited record + * (doing at most 'lookahead' steps). This is an + * optimization for the access pattern including many + * seeks with consecutive keys. + * @return the current instance of SkipListMemTableConfig + */ + public SkipListMemTableConfig setLookahead(long lookahead) { + lookahead_ = lookahead; + return this; + } + + /** + * Returns the currently set lookahead value. + * + * @return lookahead value + */ + public long lookahead() { + return lookahead_; + } + + @Override protected long newMemTableFactoryHandle() { - return newMemTableFactoryHandle0(); + return newMemTableFactoryHandle0(lookahead_); } - private native long newMemTableFactoryHandle0(); + private native long newMemTableFactoryHandle0(long lookahead); + + private long lookahead_; } diff --git a/java/org/rocksdb/Slice.java b/java/org/rocksdb/Slice.java new file mode 100644 index 000000000..4449cb7b8 --- /dev/null +++ b/java/org/rocksdb/Slice.java @@ -0,0 +1,84 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * Base class for slices which will receive + * byte[] based access to the underlying data. + * + * byte[] backed slices typically perform better with + * small keys and values. When using larger keys and + * values consider using @see org.rocksdb.DirectSlice + */ +public class Slice extends AbstractSlice { + /** + * Called from JNI to construct a new Java Slice + * without an underlying C++ object set + * at creation time. + * + * Note: You should be aware that + * {@see org.rocksdb.RocksObject#disOwnNativeHandle()} is intentionally + * called from the default Slice constructor, and that it is marked as + * private. This is so that developers cannot construct their own default + * Slice objects (at present). As developers cannot construct their own + * Slice objects through this, they are not creating underlying C++ Slice + * objects, and so there is nothing to free (dispose) from Java. + */ + private Slice() { + super(); + disOwnNativeHandle(); + } + + /** + * Constructs a slice + * where the data is taken from + * a String. + */ + public Slice(final String str) { + super(); + createNewSliceFromString(str); + } + + /** + * Constructs a slice + * where the data is a copy of + * the byte array from a specific offset. + */ + public Slice(final byte[] data, final int offset) { + super(); + createNewSlice0(data, offset); + } + + /** + * Constructs a slice + * where the data is a copy of + * the byte array. + */ + public Slice(final byte[] data) { + super(); + createNewSlice1(data); + } + + /** + * Deletes underlying C++ slice pointer + * and any buffered data. + * + *

+ * Note that this function should be called only after all + * RocksDB instances referencing the slice are closed. + * Otherwise an undefined behavior will occur. + */ + @Override + protected void disposeInternal() { + super.disposeInternal(); + disposeInternalBuf(nativeHandle_); + } + + @Override protected final native byte[] data0(long handle); + private native void createNewSlice0(byte[] data, int length); + private native void createNewSlice1(byte[] data); + private native void disposeInternalBuf(long handle); +} diff --git a/java/org/rocksdb/VectorMemTableConfig.java b/java/org/rocksdb/VectorMemTableConfig.java index b7a413f19..ba1be3e77 100644 --- a/java/org/rocksdb/VectorMemTableConfig.java +++ b/java/org/rocksdb/VectorMemTableConfig.java @@ -5,6 +5,10 @@ package org.rocksdb; */ public class VectorMemTableConfig extends MemTableConfig { public static final int DEFAULT_RESERVED_SIZE = 0; + + /** + * VectorMemTableConfig constructor + */ public VectorMemTableConfig() { reservedSize_ = DEFAULT_RESERVED_SIZE; } diff --git a/java/org/rocksdb/test/AbstractComparatorTest.java b/java/org/rocksdb/test/AbstractComparatorTest.java new file mode 100644 index 000000000..dfdb3cad9 --- /dev/null +++ b/java/org/rocksdb/test/AbstractComparatorTest.java @@ -0,0 +1,166 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb.test; + +import org.rocksdb.*; + +import java.io.IOException; +import java.nio.file.*; +import java.nio.file.attribute.BasicFileAttributes; +import java.util.Random; + +import static org.rocksdb.test.Types.byteToInt; +import static org.rocksdb.test.Types.intToByte; + +/** + * Abstract tests for both Comparator and DirectComparator + */ +public abstract class AbstractComparatorTest { + + /** + * Get a comparator which will expect Integer keys + * and determine an ascending order + * + * @return An integer ascending order key comparator + */ + public abstract AbstractComparator getAscendingIntKeyComparator(); + + /** + * Test which stores random keys into the database + * using an @see getAscendingIntKeyComparator + * it then checks that these keys are read back in + * ascending order + * + * @param db_path A path where we can store database + * files temporarily + */ + public void testRoundtrip(final Path db_path) throws IOException { + + Options opt = null; + RocksDB db = null; + + try { + opt = new Options(); + opt.setCreateIfMissing(true); + opt.setComparator(getAscendingIntKeyComparator()); + + // store 10,000 random integer keys + final int ITERATIONS = 10000; + + db = RocksDB.open(opt, db_path.toString()); + final Random random = new Random(); + for (int i = 0; i < ITERATIONS; i++) { + final byte key[] = intToByte(random.nextInt()); + if (i > 0 && db.get(key) != null) { // does key already exist (avoid duplicates) + i--; // generate a different key + } else { + db.put(key, "value".getBytes()); + } + } + db.close(); + + + // re-open db and read from start to end + // integer keys should be in ascending + // order as defined by SimpleIntComparator + db = RocksDB.open(opt, db_path.toString()); + final RocksIterator it = db.newIterator(); + it.seekToFirst(); + int lastKey = Integer.MIN_VALUE; + int count = 0; + for (it.seekToFirst(); it.isValid(); it.next()) { + final int thisKey = byteToInt(it.key()); + assert(thisKey > lastKey); + lastKey = thisKey; + count++; + } + db.close(); + + assert(count == ITERATIONS); + + } catch (final RocksDBException e) { + System.err.format("[ERROR]: %s%n", e); + e.printStackTrace(); + } finally { + if (db != null) { + db.close(); + } + + if (opt != null) { + opt.dispose(); + } + + removeDb(db_path); // cleanup after ourselves! + } + } + + /** + * Compares integer keys + * so that they are in ascending order + * + * @param a 4-bytes representing an integer key + * @param b 4-bytes representing an integer key + * + * @return negative if a < b, 0 if a == b, positive otherwise + */ + protected final int compareIntKeys(final byte[] a, final byte[] b) { + + final int iA = byteToInt(a); + final int iB = byteToInt(b); + + // protect against int key calculation overflow + final double diff = (double)iA - iB; + final int result; + if (diff < Integer.MIN_VALUE) { + result = Integer.MIN_VALUE; + } else if(diff > Integer.MAX_VALUE) { + result = Integer.MAX_VALUE; + } else { + result = (int)diff; + } + + return result; + } + + /** + * Utility method for deleting database files + * + * @param db_path The path to the database to remove + * from the filesystem + */ + private static void removeDb(final Path db_path) throws IOException { + Files.walkFileTree(db_path, new SimpleFileVisitor() { + @Override + public FileVisitResult visitFile(final Path file, final BasicFileAttributes attrs) + throws IOException { + Files.delete(file); + return FileVisitResult.CONTINUE; + } + + @Override + public FileVisitResult visitFileFailed(final Path file, IOException exc) + throws IOException { + // try to delete the file anyway, even if its attributes + // could not be read, since delete-only access is + // theoretically possible + Files.delete(file); + return FileVisitResult.CONTINUE; + } + + @Override + public FileVisitResult postVisitDirectory(final Path dir, IOException exc) + throws IOException { + if (exc == null) { + Files.delete(dir); + return FileVisitResult.CONTINUE; + } else { + // directory iteration failed; propagate exception + throw exc; + } + } + }); + } +} diff --git a/java/org/rocksdb/test/ComparatorOptionsTest.java b/java/org/rocksdb/test/ComparatorOptionsTest.java new file mode 100644 index 000000000..e25209392 --- /dev/null +++ b/java/org/rocksdb/test/ComparatorOptionsTest.java @@ -0,0 +1,34 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb.test; + +import org.rocksdb.ComparatorOptions; +import org.rocksdb.RocksDB; + +import java.util.Random; + +public class ComparatorOptionsTest { + + static { + RocksDB.loadLibrary(); + } + + public static void main(String[] args) { + final ComparatorOptions copt = new ComparatorOptions(); + Random rand = new Random(); + + { // UseAdaptiveMutex test + copt.setUseAdaptiveMutex(true); + assert(copt.useAdaptiveMutex() == true); + + copt.setUseAdaptiveMutex(false); + assert(copt.useAdaptiveMutex() == false); + } + + copt.dispose(); + System.out.println("Passed ComparatorOptionsTest"); + } +} diff --git a/java/org/rocksdb/test/ComparatorTest.java b/java/org/rocksdb/test/ComparatorTest.java new file mode 100644 index 000000000..34d7c78df --- /dev/null +++ b/java/org/rocksdb/test/ComparatorTest.java @@ -0,0 +1,45 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb.test; + +import org.rocksdb.*; + +import java.io.IOException; +import java.nio.file.FileSystems; + +public class ComparatorTest { + private static final String db_path = "/tmp/comparator_db"; + + static { + RocksDB.loadLibrary(); + } + + public static void main(String[] args) throws IOException { + + final AbstractComparatorTest comparatorTest = new AbstractComparatorTest() { + @Override + public AbstractComparator getAscendingIntKeyComparator() { + return new Comparator(new ComparatorOptions()) { + + @Override + public String name() { + return "test.AscendingIntKeyComparator"; + } + + @Override + public int compare(final Slice a, final Slice b) { + return compareIntKeys(a.data(), b.data()); + } + }; + } + }; + + // test the round-tripability of keys written and read with the Comparator + comparatorTest.testRoundtrip(FileSystems.getDefault().getPath(db_path)); + + System.out.println("Passed ComparatorTest"); + } +} diff --git a/java/org/rocksdb/test/DirectComparatorTest.java b/java/org/rocksdb/test/DirectComparatorTest.java new file mode 100644 index 000000000..9df06eb73 --- /dev/null +++ b/java/org/rocksdb/test/DirectComparatorTest.java @@ -0,0 +1,48 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb.test; + +import org.rocksdb.*; + +import java.io.IOException; +import java.nio.file.FileSystems; + +public class DirectComparatorTest { + private static final String db_path = "/tmp/direct_comparator_db"; + + static { + RocksDB.loadLibrary(); + } + + public static void main(String[] args) throws IOException { + + final AbstractComparatorTest comparatorTest = new AbstractComparatorTest() { + @Override + public AbstractComparator getAscendingIntKeyComparator() { + return new DirectComparator(new ComparatorOptions()) { + + @Override + public String name() { + return "test.AscendingIntKeyDirectComparator"; + } + + @Override + public int compare(final DirectSlice a, final DirectSlice b) { + final byte ax[] = new byte[4], bx[] = new byte[4]; + a.data().get(ax); + b.data().get(bx); + return compareIntKeys(ax, bx); + } + }; + } + }; + + // test the round-tripability of keys written and read with the DirectComparator + comparatorTest.testRoundtrip(FileSystems.getDefault().getPath(db_path)); + + System.out.println("Passed DirectComparatorTest"); + } +} diff --git a/java/org/rocksdb/test/MemTableTest.java b/java/org/rocksdb/test/MemTableTest.java new file mode 100644 index 000000000..0d1e4d54a --- /dev/null +++ b/java/org/rocksdb/test/MemTableTest.java @@ -0,0 +1,107 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb.test; + +import org.rocksdb.*; + +public class MemTableTest { + static { + RocksDB.loadLibrary(); + } + public static void main(String[] args) { + Options options = new Options(); + // Test HashSkipListMemTableConfig + HashSkipListMemTableConfig memTableConfig = + new HashSkipListMemTableConfig(); + assert(memTableConfig.bucketCount() == 1000000); + memTableConfig.setBucketCount(2000000); + assert(memTableConfig.bucketCount() == 2000000); + assert(memTableConfig.height() == 4); + memTableConfig.setHeight(5); + assert(memTableConfig.height() == 5); + assert(memTableConfig.branchingFactor() == 4); + memTableConfig.setBranchingFactor(6); + assert(memTableConfig.branchingFactor() == 6); + try { + options.setMemTableConfig(memTableConfig); + } catch (RocksDBException e) { + assert(false); + } + memTableConfig = null; + options.dispose(); + System.gc(); + System.runFinalization(); + // Test SkipList + options = new Options(); + SkipListMemTableConfig skipMemTableConfig = + new SkipListMemTableConfig(); + assert(skipMemTableConfig.lookahead() == 0); + skipMemTableConfig.setLookahead(20); + assert(skipMemTableConfig.lookahead() == 20); + try { + options.setMemTableConfig(skipMemTableConfig); + } catch (RocksDBException e) { + assert(false); + } + skipMemTableConfig = null; + options.dispose(); + System.gc(); + System.runFinalization(); + // Test HashLinkedListMemTableConfig + options = new Options(); + HashLinkedListMemTableConfig hashLinkedListMemTableConfig = + new HashLinkedListMemTableConfig(); + assert(hashLinkedListMemTableConfig.bucketCount() == 50000); + hashLinkedListMemTableConfig.setBucketCount(100000); + assert(hashLinkedListMemTableConfig.bucketCount() == 100000); + assert(hashLinkedListMemTableConfig.hugePageTlbSize() == 0); + hashLinkedListMemTableConfig.setHugePageTlbSize(1); + assert(hashLinkedListMemTableConfig.hugePageTlbSize() == 1); + assert(hashLinkedListMemTableConfig. + bucketEntriesLoggingThreshold() == 4096); + hashLinkedListMemTableConfig. + setBucketEntriesLoggingThreshold(200); + assert(hashLinkedListMemTableConfig. + bucketEntriesLoggingThreshold() == 200); + assert(hashLinkedListMemTableConfig. + ifLogBucketDistWhenFlush() == true); + hashLinkedListMemTableConfig. + setIfLogBucketDistWhenFlush(false); + assert(hashLinkedListMemTableConfig. + ifLogBucketDistWhenFlush() == false); + assert(hashLinkedListMemTableConfig. + thresholdUseSkiplist() == 256); + hashLinkedListMemTableConfig.setThresholdUseSkiplist(29); + assert(hashLinkedListMemTableConfig. + thresholdUseSkiplist() == 29); + try { + options.setMemTableConfig(hashLinkedListMemTableConfig); + } catch (RocksDBException e) { + assert(false); + } + hashLinkedListMemTableConfig = null; + options.dispose(); + System.gc(); + System.runFinalization(); + // test VectorMemTableConfig + options = new Options(); + VectorMemTableConfig vectorMemTableConfig = + new VectorMemTableConfig(); + assert(vectorMemTableConfig.reservedSize() == 0); + vectorMemTableConfig.setReservedSize(123); + assert(vectorMemTableConfig.reservedSize() == 123); + try { + options.setMemTableConfig(vectorMemTableConfig); + } catch (RocksDBException e) { + assert(false); + } + vectorMemTableConfig = null; + options.dispose(); + System.gc(); + System.runFinalization(); + System.out.println("Mem-table test passed"); + } +} diff --git a/java/org/rocksdb/test/Types.java b/java/org/rocksdb/test/Types.java new file mode 100644 index 000000000..22fcd3537 --- /dev/null +++ b/java/org/rocksdb/test/Types.java @@ -0,0 +1,43 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb.test; + +/** + * Simple type conversion methods + * for use in tests + */ +public class Types { + + /** + * Convert first 4 bytes of a byte array to an int + * + * @param data The byte array + * + * @return An integer + */ + public static int byteToInt(final byte data[]) { + return (data[0] & 0xff) | + ((data[1] & 0xff) << 8) | + ((data[2] & 0xff) << 16) | + ((data[3] & 0xff) << 24); + } + + /** + * Convert an int to 4 bytes + * + * @param v The int + * + * @return A byte array containing 4 bytes + */ + public static byte[] intToByte(final int v) { + return new byte[] { + (byte)((v >>> 0) & 0xff), + (byte)((v >>> 8) & 0xff), + (byte)((v >>> 16) & 0xff), + (byte)((v >>> 24) & 0xff) + }; + } +} diff --git a/java/rocksjni/comparator.cc b/java/rocksjni/comparator.cc new file mode 100644 index 000000000..196376235 --- /dev/null +++ b/java/rocksjni/comparator.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file implements the "bridge" between Java and C++ for +// rocksdb::Comparator. + +#include +#include +#include +#include +#include + +#include "include/org_rocksdb_AbstractComparator.h" +#include "include/org_rocksdb_Comparator.h" +#include "include/org_rocksdb_DirectComparator.h" +#include "rocksjni/comparatorjnicallback.h" +#include "rocksjni/portal.h" + +// + +/* + * Class: org_rocksdb_Comparator + * Method: createNewComparator0 + * Signature: ()V + */ +void Java_org_rocksdb_Comparator_createNewComparator0( + JNIEnv* env, jobject jobj, jlong copt_handle) { + const rocksdb::ComparatorJniCallbackOptions* copt = + reinterpret_cast(copt_handle); + const rocksdb::ComparatorJniCallback* c = + new rocksdb::ComparatorJniCallback(env, jobj, copt); + rocksdb::AbstractComparatorJni::setHandle(env, jobj, c); +} +// + +// use_adaptive_mutex)), + mtx_findShortestSeparator(new port::Mutex(copt->use_adaptive_mutex)) { + + // Note: Comparator methods may be accessed by multiple threads, + // so we ref the jvm not the env + const jint rs = env->GetJavaVM(&m_jvm); + assert(rs == JNI_OK); + + // Note: we want to access the Java Comparator instance + // across multiple method calls, so we create a global ref + m_jComparator = env->NewGlobalRef(jComparator); + + // Note: The name of a Comparator will not change during it's lifetime, + // so we cache it in a global var + jmethodID jNameMethodId = AbstractComparatorJni::getNameMethodId(env); + jstring jsName = (jstring)env->CallObjectMethod(m_jComparator, jNameMethodId); + m_name = JniUtil::copyString(env, jsName); // also releases jsName + + m_jCompareMethodId = AbstractComparatorJni::getCompareMethodId(env); + m_jFindShortestSeparatorMethodId = + AbstractComparatorJni::getFindShortestSeparatorMethodId(env); + m_jFindShortSuccessorMethodId = + AbstractComparatorJni::getFindShortSuccessorMethodId(env); +} + +/** + * Attach/Get a JNIEnv for the current native thread + */ +JNIEnv* BaseComparatorJniCallback::getJniEnv() const { + JNIEnv *env; + jint rs = m_jvm->AttachCurrentThread(reinterpret_cast(&env), NULL); + assert(rs == JNI_OK); + return env; +} + +const char* BaseComparatorJniCallback::Name() const { + return m_name.c_str(); +} + +int BaseComparatorJniCallback::Compare(const Slice& a, const Slice& b) const { + JNIEnv* m_env = getJniEnv(); + + // TODO(adamretter): slice objects can potentially be cached using thread + // local variables to avoid locking. Could make this configurable depending on + // performance. + mtx_compare->Lock(); + + AbstractSliceJni::setHandle(m_env, m_jSliceA, &a); + AbstractSliceJni::setHandle(m_env, m_jSliceB, &b); + jint result = + m_env->CallIntMethod(m_jComparator, m_jCompareMethodId, m_jSliceA, + m_jSliceB); + + mtx_compare->Unlock(); + + m_jvm->DetachCurrentThread(); + + return result; +} + +void BaseComparatorJniCallback::FindShortestSeparator( + std::string* start, const Slice& limit) const { + if (start == nullptr) { + return; + } + + JNIEnv* m_env = getJniEnv(); + + const char* startUtf = start->c_str(); + jstring jsStart = m_env->NewStringUTF(startUtf); + + // TODO(adamretter): slice object can potentially be cached using thread local + // variable to avoid locking. Could make this configurable depending on + // performance. + mtx_findShortestSeparator->Lock(); + + AbstractSliceJni::setHandle(m_env, m_jSliceLimit, &limit); + jstring jsResultStart = + (jstring)m_env->CallObjectMethod(m_jComparator, + m_jFindShortestSeparatorMethodId, jsStart, m_jSliceLimit); + + mtx_findShortestSeparator->Unlock(); + + m_env->DeleteLocalRef(jsStart); + + if (jsResultStart != nullptr) { + // update start with result + *start = + JniUtil::copyString(m_env, jsResultStart); // also releases jsResultStart + } + + m_jvm->DetachCurrentThread(); +} + +void BaseComparatorJniCallback::FindShortSuccessor(std::string* key) const { + if (key == nullptr) { + return; + } + + JNIEnv* m_env = getJniEnv(); + + const char* keyUtf = key->c_str(); + jstring jsKey = m_env->NewStringUTF(keyUtf); + + jstring jsResultKey = + (jstring)m_env->CallObjectMethod(m_jComparator, + m_jFindShortSuccessorMethodId, jsKey); + + m_env->DeleteLocalRef(jsKey); + + if (jsResultKey != nullptr) { + // updates key with result, also releases jsResultKey. + *key = JniUtil::copyString(m_env, jsResultKey); + } + + m_jvm->DetachCurrentThread(); +} + +BaseComparatorJniCallback::~BaseComparatorJniCallback() { + JNIEnv* m_env = getJniEnv(); + + m_env->DeleteGlobalRef(m_jComparator); + + // Note: do not need to explicitly detach, as this function is effectively + // called from the Java class's disposeInternal method, and so already + // has an attached thread, getJniEnv above is just a no-op Attach to get + // the env jvm->DetachCurrentThread(); +} + +ComparatorJniCallback::ComparatorJniCallback( + JNIEnv* env, jobject jComparator, + const ComparatorJniCallbackOptions* copt) : + BaseComparatorJniCallback(env, jComparator, copt) { + m_jSliceA = env->NewGlobalRef(SliceJni::construct0(env)); + m_jSliceB = env->NewGlobalRef(SliceJni::construct0(env)); + m_jSliceLimit = env->NewGlobalRef(SliceJni::construct0(env)); +} + +ComparatorJniCallback::~ComparatorJniCallback() { + JNIEnv* m_env = getJniEnv(); + m_env->DeleteGlobalRef(m_jSliceA); + m_env->DeleteGlobalRef(m_jSliceB); + m_env->DeleteGlobalRef(m_jSliceLimit); +} + +DirectComparatorJniCallback::DirectComparatorJniCallback( + JNIEnv* env, jobject jComparator, + const ComparatorJniCallbackOptions* copt) : + BaseComparatorJniCallback(env, jComparator, copt) { + m_jSliceA = env->NewGlobalRef(DirectSliceJni::construct0(env)); + m_jSliceB = env->NewGlobalRef(DirectSliceJni::construct0(env)); + m_jSliceLimit = env->NewGlobalRef(DirectSliceJni::construct0(env)); +} + +DirectComparatorJniCallback::~DirectComparatorJniCallback() { + JNIEnv* m_env = getJniEnv(); + m_env->DeleteGlobalRef(m_jSliceA); + m_env->DeleteGlobalRef(m_jSliceB); + m_env->DeleteGlobalRef(m_jSliceLimit); +} +} // namespace rocksdb diff --git a/java/rocksjni/comparatorjnicallback.h b/java/rocksjni/comparatorjnicallback.h new file mode 100644 index 000000000..65b986ca4 --- /dev/null +++ b/java/rocksjni/comparatorjnicallback.h @@ -0,0 +1,95 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file implements the callback "bridge" between Java and C++ for +// rocksdb::Comparator and rocksdb::DirectComparator. + +#ifndef JAVA_ROCKSJNI_COMPARATORJNICALLBACK_H_ +#define JAVA_ROCKSJNI_COMPARATORJNICALLBACK_H_ + +#include +#include +#include "rocksdb/comparator.h" +#include "rocksdb/slice.h" +#include "port/port.h" + +namespace rocksdb { + +struct ComparatorJniCallbackOptions { + // Use adaptive mutex, which spins in the user space before resorting + // to kernel. This could reduce context switch when the mutex is not + // heavily contended. However, if the mutex is hot, we could end up + // wasting spin time. + // Default: false + bool use_adaptive_mutex; + + ComparatorJniCallbackOptions() : use_adaptive_mutex(false) { + } +}; + +/** + * This class acts as a bridge between C++ + * and Java. The methods in this class will be + * called back from the RocksDB storage engine (C++) + * we then callback to the appropriate Java method + * this enables Comparators to be implemented in Java. + * + * The design of this Comparator caches the Java Slice + * objects that are used in the compare and findShortestSeparator + * method callbacks. Instead of creating new objects for each callback + * of those functions, by reuse via setHandle we are a lot + * faster; Unfortunately this means that we have to + * introduce independent locking in regions of each of those methods + * via the mutexs mtx_compare and mtx_findShortestSeparator respectively + */ +class BaseComparatorJniCallback : public Comparator { + public: + BaseComparatorJniCallback( + JNIEnv* env, jobject jComparator, + const ComparatorJniCallbackOptions* copt); + virtual ~BaseComparatorJniCallback(); + virtual const char* Name() const; + virtual int Compare(const Slice& a, const Slice& b) const; + virtual void FindShortestSeparator( + std::string* start, const Slice& limit) const; + virtual void FindShortSuccessor(std::string* key) const; + + private: + // used for synchronisation in compare method + port::Mutex* mtx_compare; + // used for synchronisation in findShortestSeparator method + port::Mutex* mtx_findShortestSeparator; + JavaVM* m_jvm; + jobject m_jComparator; + std::string m_name; + jmethodID m_jCompareMethodId; + jmethodID m_jFindShortestSeparatorMethodId; + jmethodID m_jFindShortSuccessorMethodId; + + protected: + JNIEnv* getJniEnv() const; + jobject m_jSliceA; + jobject m_jSliceB; + jobject m_jSliceLimit; +}; + +class ComparatorJniCallback : public BaseComparatorJniCallback { + public: + ComparatorJniCallback( + JNIEnv* env, jobject jComparator, + const ComparatorJniCallbackOptions* copt); + ~ComparatorJniCallback(); +}; + +class DirectComparatorJniCallback : public BaseComparatorJniCallback { + public: + DirectComparatorJniCallback( + JNIEnv* env, jobject jComparator, + const ComparatorJniCallbackOptions* copt); + ~DirectComparatorJniCallback(); +}; +} // namespace rocksdb + +#endif // JAVA_ROCKSJNI_COMPARATORJNICALLBACK_H_ diff --git a/java/rocksjni/memtablejni.cc b/java/rocksjni/memtablejni.cc index 4be03d491..fe83885c2 100644 --- a/java/rocksjni/memtablejni.cc +++ b/java/rocksjni/memtablejni.cc @@ -34,16 +34,26 @@ jlong Java_org_rocksdb_HashSkipListMemTableConfig_newMemTableFactoryHandle( /* * Class: org_rocksdb_HashLinkedListMemTableConfig * Method: newMemTableFactoryHandle - * Signature: (J)J + * Signature: (JJIZI)J */ jlong Java_org_rocksdb_HashLinkedListMemTableConfig_newMemTableFactoryHandle( - JNIEnv* env, jobject jobj, jlong jbucket_count) { - rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(jbucket_count); - if (s.ok()) { + JNIEnv* env, jobject jobj, jlong jbucket_count, jlong jhuge_page_tlb_size, + jint jbucket_entries_logging_threshold, + jboolean jif_log_bucket_dist_when_flash, jint jthreshold_use_skiplist) { + rocksdb::Status statusBucketCount = + rocksdb::check_if_jlong_fits_size_t(jbucket_count); + rocksdb::Status statusHugePageTlb = + rocksdb::check_if_jlong_fits_size_t(jhuge_page_tlb_size); + if (statusBucketCount.ok() && statusHugePageTlb.ok()) { return reinterpret_cast(rocksdb::NewHashLinkListRepFactory( - static_cast(jbucket_count))); + static_cast(jbucket_count), + static_cast(jhuge_page_tlb_size), + static_cast(jbucket_entries_logging_threshold), + static_cast(jif_log_bucket_dist_when_flash), + static_cast(jthreshold_use_skiplist))); } - rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + rocksdb::RocksDBExceptionJni::ThrowNew(env, + !statusBucketCount.ok()?statusBucketCount:statusHugePageTlb); return 0; } @@ -66,9 +76,15 @@ jlong Java_org_rocksdb_VectorMemTableConfig_newMemTableFactoryHandle( /* * Class: org_rocksdb_SkipListMemTableConfig * Method: newMemTableFactoryHandle0 - * Signature: ()J + * Signature: (J)J */ jlong Java_org_rocksdb_SkipListMemTableConfig_newMemTableFactoryHandle0( - JNIEnv* env, jobject jobj) { - return reinterpret_cast(new rocksdb::SkipListFactory()); + JNIEnv* env, jobject jobj, jlong jlookahead) { + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(jlookahead); + if (s.ok()) { + return reinterpret_cast(new rocksdb::SkipListFactory( + static_cast(jlookahead))); + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return 0; } diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index 1a43c4966..3b8e5d7c4 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -14,6 +14,9 @@ #include "include/org_rocksdb_Options.h" #include "include/org_rocksdb_WriteOptions.h" #include "include/org_rocksdb_ReadOptions.h" +#include "include/org_rocksdb_ComparatorOptions.h" + +#include "rocksjni/comparatorjnicallback.h" #include "rocksjni/portal.h" #include "rocksdb/db.h" #include "rocksdb/options.h" @@ -166,6 +169,17 @@ jlong Java_org_rocksdb_Options_statisticsPtr( return reinterpret_cast(st); } +/* + * Class: org_rocksdb_Options + * Method: setComparatorHandle + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setComparatorHandle( + JNIEnv* env, jobject jobj, jlong jopt_handle, jlong jcomparator_handle) { + reinterpret_cast(jopt_handle)->comparator = + reinterpret_cast(jcomparator_handle); +} + /* * Class: org_rocksdb_Options * Method: maxWriteBufferNumber @@ -1785,3 +1799,49 @@ void Java_org_rocksdb_ReadOptions_setTailing( reinterpret_cast(jhandle)->tailing = static_cast(jtailing); } + +///////////////////////////////////////////////////////////////////// +// rocksdb::ComparatorOptions +/* + * Class: org_rocksdb_ComparatorOptions + * Method: newComparatorOptions + * Signature: ()V + */ +void Java_org_rocksdb_ComparatorOptions_newComparatorOptions( + JNIEnv* env, jobject jobj) { + auto comparator_opt = new rocksdb::ComparatorJniCallbackOptions(); + rocksdb::ComparatorOptionsJni::setHandle(env, jobj, comparator_opt); +} + +/* + * Class: org_rocksdb_ComparatorOptions + * Method: useAdaptiveMutex + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_ComparatorOptions_useAdaptiveMutex( + JNIEnv * env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle) + ->use_adaptive_mutex; +} + +/* + * Class: org_rocksdb_ComparatorOptions + * Method: setUseAdaptiveMutex + * Signature: (JZ)V + */ +void Java_org_rocksdb_ComparatorOptions_setUseAdaptiveMutex( + JNIEnv * env, jobject jobj, jlong jhandle, jboolean juse_adaptive_mutex) { + reinterpret_cast(jhandle) + ->use_adaptive_mutex = static_cast(juse_adaptive_mutex); +} + +/* + * Class: org_rocksdb_ComparatorOptions + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_ComparatorOptions_disposeInternal( + JNIEnv * env, jobject jobj, jlong jhandle) { + delete reinterpret_cast(jhandle); + rocksdb::ComparatorOptionsJni::setHandle(env, jobj, nullptr); +} diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 8300a6e66..32452ae0b 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -12,10 +12,13 @@ #include #include +#include + #include "rocksdb/db.h" #include "rocksdb/filter_policy.h" #include "rocksdb/status.h" #include "rocksdb/utilities/backupable_db.h" +#include "rocksjni/comparatorjnicallback.h" namespace rocksdb { @@ -362,6 +365,168 @@ class ColumnFamilyHandleJni { } }; +class ComparatorOptionsJni { + public: + // Get the java class id of org.rocksdb.ComparatorOptions. + static jclass getJClass(JNIEnv* env) { + jclass jclazz = env->FindClass("org/rocksdb/ComparatorOptions"); + assert(jclazz != nullptr); + return jclazz; + } + + // Get the field id of the member variable of org.rocksdb.ComparatorOptions + // that stores the pointer to rocksdb::ComparatorJniCallbackOptions. + static jfieldID getHandleFieldID(JNIEnv* env) { + static jfieldID fid = env->GetFieldID( + getJClass(env), "nativeHandle_", "J"); + assert(fid != nullptr); + return fid; + } + + // Pass the ComparatorJniCallbackOptions pointer to the java side. + static void setHandle( + JNIEnv* env, jobject jobj, + const rocksdb::ComparatorJniCallbackOptions* op) { + env->SetLongField( + jobj, getHandleFieldID(env), + reinterpret_cast(op)); + } +}; + +class AbstractComparatorJni { + public: + // Get the java class id of org.rocksdb.Comparator. + static jclass getJClass(JNIEnv* env) { + jclass jclazz = env->FindClass("org/rocksdb/AbstractComparator"); + assert(jclazz != nullptr); + return jclazz; + } + + // Get the field id of the member variable of org.rocksdb.Comparator + // that stores the pointer to rocksdb::Comparator. + static jfieldID getHandleFieldID(JNIEnv* env) { + static jfieldID fid = env->GetFieldID( + getJClass(env), "nativeHandle_", "J"); + assert(fid != nullptr); + return fid; + } + + // Get the java method `name` of org.rocksdb.Comparator. + static jmethodID getNameMethodId(JNIEnv* env) { + static jmethodID mid = env->GetMethodID( + getJClass(env), "name", "()Ljava/lang/String;"); + assert(mid != nullptr); + return mid; + } + + // Get the java method `compare` of org.rocksdb.Comparator. + static jmethodID getCompareMethodId(JNIEnv* env) { + static jmethodID mid = env->GetMethodID(getJClass(env), + "compare", + "(Lorg/rocksdb/AbstractSlice;Lorg/rocksdb/AbstractSlice;)I"); + assert(mid != nullptr); + return mid; + } + + // Get the java method `findShortestSeparator` of org.rocksdb.Comparator. + static jmethodID getFindShortestSeparatorMethodId(JNIEnv* env) { + static jmethodID mid = env->GetMethodID(getJClass(env), + "findShortestSeparator", + "(Ljava/lang/String;Lorg/rocksdb/AbstractSlice;)Ljava/lang/String;"); + assert(mid != nullptr); + return mid; + } + + // Get the java method `findShortSuccessor` of org.rocksdb.Comparator. + static jmethodID getFindShortSuccessorMethodId(JNIEnv* env) { + static jmethodID mid = env->GetMethodID(getJClass(env), + "findShortSuccessor", + "(Ljava/lang/String;)Ljava/lang/String;"); + assert(mid != nullptr); + return mid; + } + + // Get the pointer to ComparatorJniCallback. + static rocksdb::BaseComparatorJniCallback* getHandle( + JNIEnv* env, jobject jobj) { + return reinterpret_cast( + env->GetLongField(jobj, getHandleFieldID(env))); + } + + // Pass the ComparatorJniCallback pointer to the java side. + static void setHandle( + JNIEnv* env, jobject jobj, const rocksdb::BaseComparatorJniCallback* op) { + env->SetLongField( + jobj, getHandleFieldID(env), + reinterpret_cast(op)); + } +}; + +class AbstractSliceJni { + public: + // Get the java class id of org.rocksdb.Slice. + static jclass getJClass(JNIEnv* env) { + jclass jclazz = env->FindClass("org/rocksdb/AbstractSlice"); + assert(jclazz != nullptr); + return jclazz; + } + + // Get the field id of the member variable of org.rocksdb.Slice + // that stores the pointer to rocksdb::Slice. + static jfieldID getHandleFieldID(JNIEnv* env) { + static jfieldID fid = env->GetFieldID( + getJClass(env), "nativeHandle_", "J"); + assert(fid != nullptr); + return fid; + } + + // Get the pointer to Slice. + static rocksdb::Slice* getHandle(JNIEnv* env, jobject jobj) { + return reinterpret_cast( + env->GetLongField(jobj, getHandleFieldID(env))); + } + + // Pass the Slice pointer to the java side. + static void setHandle( + JNIEnv* env, jobject jobj, const rocksdb::Slice* op) { + env->SetLongField( + jobj, getHandleFieldID(env), + reinterpret_cast(op)); + } +}; + +class SliceJni { + public: + // Get the java class id of org.rocksdb.Slice. + static jclass getJClass(JNIEnv* env) { + jclass jclazz = env->FindClass("org/rocksdb/Slice"); + assert(jclazz != nullptr); + return jclazz; + } + + static jobject construct0(JNIEnv* env) { + static jmethodID mid = env->GetMethodID(getJClass(env), "", "()V"); + assert(mid != nullptr); + return env->NewObject(getJClass(env), mid); + } +}; + +class DirectSliceJni { + public: + // Get the java class id of org.rocksdb.DirectSlice. + static jclass getJClass(JNIEnv* env) { + jclass jclazz = env->FindClass("org/rocksdb/DirectSlice"); + assert(jclazz != nullptr); + return jclazz; + } + + static jobject construct0(JNIEnv* env) { + static jmethodID mid = env->GetMethodID(getJClass(env), "", "()V"); + assert(mid != nullptr); + return env->NewObject(getJClass(env), mid); + } +}; + class ListJni { public: // Get the java class id of java.util.List. @@ -425,5 +590,20 @@ class ListJni { return mid; } }; + +class JniUtil { + public: + /** + * Copies a jstring to a std::string + * and releases the original jstring + */ + static std::string copyString(JNIEnv* env, jstring js) { + const char *utf = env->GetStringUTFChars(js, NULL); + std::string name(utf); + env->ReleaseStringUTFChars(js, utf); + return name; + } +}; + } // namespace rocksdb #endif // JAVA_ROCKSJNI_PORTAL_H_ diff --git a/java/rocksjni/slice.cc b/java/rocksjni/slice.cc new file mode 100644 index 000000000..0d8b92c9c --- /dev/null +++ b/java/rocksjni/slice.cc @@ -0,0 +1,251 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file implements the "bridge" between Java and C++ for +// rocksdb::Slice. + +#include +#include +#include +#include + +#include "include/org_rocksdb_AbstractSlice.h" +#include "include/org_rocksdb_Slice.h" +#include "include/org_rocksdb_DirectSlice.h" +#include "rocksdb/slice.h" +#include "rocksjni/portal.h" + +// + +/* + * Class: org_rocksdb_Slice + * Method: createNewSlice0 + * Signature: ([BI)V + */ +void Java_org_rocksdb_Slice_createNewSlice0( + JNIEnv * env, jobject jobj, jbyteArray data, jint offset) { + + const jsize dataSize = env->GetArrayLength(data); + const int len = dataSize - offset; + jbyte* ptrData = new jbyte[len]; + env->GetByteArrayRegion(data, offset, len, ptrData); + + const rocksdb::Slice* slice = new rocksdb::Slice((const char*)ptrData, len); + rocksdb::AbstractSliceJni::setHandle(env, jobj, slice); +} + +/* + * Class: org_rocksdb_Slice + * Method: createNewSlice1 + * Signature: ([B)V + */ +void Java_org_rocksdb_Slice_createNewSlice1( + JNIEnv * env, jobject jobj, jbyteArray data) { + + const int len = env->GetArrayLength(data); + + jboolean isCopy; + jbyte* ptrData = env->GetByteArrayElements(data, &isCopy); + const char* buf = new char[len]; + memcpy(const_cast(buf), ptrData, len); + + const rocksdb::Slice* slice = + new rocksdb::Slice(buf, env->GetArrayLength(data)); + rocksdb::AbstractSliceJni::setHandle(env, jobj, slice); + + env->ReleaseByteArrayElements(data, ptrData, JNI_ABORT); + + // NOTE: buf will be deleted in the org.rocksdb.Slice#dispose method +} + +/* + * Class: org_rocksdb_Slice + * Method: data0 + * Signature: (J)[B + */ +jbyteArray Java_org_rocksdb_Slice_data0( + JNIEnv* env, jobject jobj, jlong handle) { + const rocksdb::Slice* slice = reinterpret_cast(handle); + const int len = slice->size(); + const jbyteArray data = env->NewByteArray(len); + env->SetByteArrayRegion(data, 0, len, + reinterpret_cast(const_cast(slice->data()))); + return data; +} + +/* + * Class: org_rocksdb_Slice + * Method: disposeInternalBuf + * Signature: (J)V + */ +void Java_org_rocksdb_Slice_disposeInternalBuf( + JNIEnv * env, jobject jobj, jlong handle) { + const rocksdb::Slice* slice = reinterpret_cast(handle); + delete [] slice->data_; +} + +// + +// iter(new_table->NewIndexIterator(ReadOptions())); s = iter->status(); @@ -506,19 +506,7 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, // Set filter block if (rep->filter_policy) { - // First try reading full_filter, then reading block_based_filter - for (auto filter_block_prefix : { kFullFilterBlockPrefix, - kFilterBlockPrefix }) { - std::string key = filter_block_prefix; - key.append(rep->filter_policy->Name()); - - BlockHandle handle; - if (FindMetaBlock(meta_iter.get(), key, &handle).ok()) { - rep->filter.reset(ReadFilter(handle, rep, - filter_block_prefix, nullptr)); - break; - } - } + rep->filter.reset(ReadFilter(rep, meta_iter.get(), nullptr)); } } else { delete index_reader; @@ -726,33 +714,43 @@ Status BlockBasedTable::PutDataBlockToCache( } FilterBlockReader* BlockBasedTable::ReadFilter( - const BlockHandle& filter_handle, BlockBasedTable::Rep* rep, - const std::string& filter_block_prefix, size_t* filter_size) { + Rep* rep, Iterator* meta_index_iter, size_t* filter_size) { // TODO: We might want to unify with ReadBlockFromFile() if we start // requiring checksum verification in Table::Open. - ReadOptions opt; - BlockContents block; - if (!ReadBlockContents(rep->file.get(), rep->footer, opt, filter_handle, - &block, rep->ioptions.env, false).ok()) { - return nullptr; - } + for (auto prefix : {kFullFilterBlockPrefix, kFilterBlockPrefix}) { + std::string filter_block_key = prefix; + filter_block_key.append(rep->filter_policy->Name()); + BlockHandle handle; + if (FindMetaBlock(meta_index_iter, filter_block_key, &handle).ok()) { + BlockContents block; + if (!ReadBlockContents(rep->file.get(), rep->footer, ReadOptions(), + handle, &block, rep->ioptions.env, false).ok()) { + // Error reading the block + return nullptr; + } - if (filter_size) { - *filter_size = block.data.size(); - } + if (filter_size) { + *filter_size = block.data.size(); + } - assert(rep->filter_policy); - if (kFilterBlockPrefix == filter_block_prefix) { - return new BlockBasedFilterBlockReader( - rep->ioptions.prefix_extractor, rep->table_options, std::move(block)); - } else if (kFullFilterBlockPrefix == filter_block_prefix) { - auto filter_bits_reader = rep->filter_policy-> - GetFilterBitsReader(block.data); - - if (filter_bits_reader != nullptr) { - return new FullFilterBlockReader(rep->ioptions.prefix_extractor, - rep->table_options, std::move(block), - filter_bits_reader); + assert(rep->filter_policy); + if (kFilterBlockPrefix == prefix) { + return new BlockBasedFilterBlockReader( + rep->ioptions.prefix_extractor, rep->table_options, + std::move(block)); + } else if (kFullFilterBlockPrefix == prefix) { + auto filter_bits_reader = rep->filter_policy-> + GetFilterBitsReader(block.data); + if (filter_bits_reader != nullptr) { + return new FullFilterBlockReader(rep->ioptions.prefix_extractor, + rep->table_options, + std::move(block), + filter_bits_reader); + } + } else { + assert(false); + return nullptr; + } } } return nullptr; @@ -760,8 +758,11 @@ FilterBlockReader* BlockBasedTable::ReadFilter( BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( bool no_io) const { - // filter pre-populated - if (rep_->filter != nullptr) { + // If cache_index_and_filter_blocks is false, filter should be pre-populated. + // We will return rep_->filter anyway. rep_->filter can be nullptr if filter + // read fails at Open() time. We don't want to reload again since it will + // most probably fail again. + if (!rep_->table_options.cache_index_and_filter_blocks) { return {rep_->filter.get(), nullptr /* cache handle */}; } @@ -775,8 +776,7 @@ BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, rep_->footer.metaindex_handle(), - cache_key - ); + cache_key); Statistics* statistics = rep_->ioptions.statistics; auto cache_handle = @@ -797,22 +797,12 @@ BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( auto s = ReadMetaBlock(rep_, &meta, &iter); if (s.ok()) { - // First try reading full_filter, then reading block_based_filter - for (auto filter_block_prefix : {kFullFilterBlockPrefix, - kFilterBlockPrefix}) { - std::string filter_block_key = filter_block_prefix; - filter_block_key.append(rep_->filter_policy->Name()); - BlockHandle handle; - if (FindMetaBlock(iter.get(), filter_block_key, &handle).ok()) { - filter = ReadFilter(handle, rep_, filter_block_prefix, &filter_size); - - if (filter == nullptr) break; // err happen in ReadFilter - assert(filter_size > 0); - cache_handle = block_cache->Insert( - key, filter, filter_size, &DeleteCachedEntry); - RecordTick(statistics, BLOCK_CACHE_ADD); - break; - } + filter = ReadFilter(rep_, iter.get(), &filter_size); + if (filter != nullptr) { + assert(filter_size > 0); + cache_handle = block_cache->Insert( + key, filter, filter_size, &DeleteCachedEntry); + RecordTick(statistics, BLOCK_CACHE_ADD); } } } diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index b272c4d13..a000c6a9a 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -183,10 +183,10 @@ class BlockBasedTable : public TableReader { std::unique_ptr* iter); // Create the filter from the filter block. - static FilterBlockReader* ReadFilter(const BlockHandle& filter_handle, - Rep* rep, - const std::string& filter_block_prefix, - size_t* filter_size = nullptr); + static FilterBlockReader* ReadFilter( + Rep* rep, + Iterator* meta_index_iter, + size_t* filter_size = nullptr); static void SetupCacheKeyPrefix(Rep* rep); diff --git a/table/cuckoo_table_factory.h b/table/cuckoo_table_factory.h index 599908678..714fdc2a0 100644 --- a/table/cuckoo_table_factory.h +++ b/table/cuckoo_table_factory.h @@ -64,7 +64,8 @@ class CuckooTableFactory : public TableFactory { const CompressionType, const CompressionOptions&) const override; // Sanitizes the specified DB Options. - Status SanitizeDBOptions(const DBOptions* db_opts) const override { + Status SanitizeOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const override { return Status::OK(); } diff --git a/table/plain_table_factory.h b/table/plain_table_factory.h index e79475221..23b54f092 100644 --- a/table/plain_table_factory.h +++ b/table/plain_table_factory.h @@ -170,8 +170,9 @@ class PlainTableFactory : public TableFactory { static const char kValueTypeSeqId0 = 0xFF; // Sanitizes the specified DB Options. - Status SanitizeDBOptions(const DBOptions* db_opts) const override { - if (db_opts->allow_mmap_reads == false) { + Status SanitizeOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const override { + if (db_opts.allow_mmap_reads == false) { return Status::NotSupported( "PlainTable with allow_mmap_reads == false is not supported."); } diff --git a/table/table_test.cc b/table/table_test.cc index e4657e8cd..5e1bbe4cf 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -1461,8 +1461,6 @@ TEST(BlockBasedTableTest, BlockCacheDisabledTest) { options.create_if_missing = true; options.statistics = CreateDBStatistics(); BlockBasedTableOptions table_options; - // Intentionally commented out: table_options.cache_index_and_filter_blocks = - // true; table_options.block_cache = NewLRUCache(1024); table_options.filter_policy.reset(NewBloomFilterPolicy(10)); options.table_factory.reset(new BlockBasedTableFactory(table_options)); @@ -1521,7 +1519,7 @@ TEST(BlockBasedTableTest, FilterBlockInBlockCache) { c.Finish(options, ioptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); // preloading filter/index blocks is prohibited. - auto reader = dynamic_cast(c.GetTableReader()); + auto* reader = dynamic_cast(c.GetTableReader()); ASSERT_TRUE(!reader->TEST_filter_block_preloaded()); ASSERT_TRUE(!reader->TEST_index_reader_preloaded()); @@ -1567,28 +1565,11 @@ TEST(BlockBasedTableTest, FilterBlockInBlockCache) { // release the iterator so that the block cache can reset correctly. iter.reset(); - // -- PART 2: Open without block cache - table_options.no_block_cache = true; - table_options.block_cache.reset(); - options.table_factory.reset(new BlockBasedTableFactory(table_options)); - options.statistics = CreateDBStatistics(); // reset the stats - const ImmutableCFOptions ioptions1(options); - c.Reopen(ioptions1); - table_options.no_block_cache = false; - - { - iter.reset(c.NewIterator()); - iter->SeekToFirst(); - ASSERT_EQ("key", iter->key().ToString()); - BlockCachePropertiesSnapshot props(options.statistics.get()); - // Nothing is affected at all - props.AssertEqual(0, 0, 0, 0); - } - - // -- PART 3: Open with very small block cache + // -- PART 2: Open with very small block cache // In this test, no block will ever get hit since the block cache is // too small to fit even one entry. table_options.block_cache = NewLRUCache(1); + options.statistics = CreateDBStatistics(); options.table_factory.reset(new BlockBasedTableFactory(table_options)); const ImmutableCFOptions ioptions2(options); c.Reopen(ioptions2); @@ -1598,7 +1579,6 @@ TEST(BlockBasedTableTest, FilterBlockInBlockCache) { 0, 0, 0); } - { // Both index and data block get accessed. // It first cache index block then data block. But since the cache size @@ -1618,6 +1598,37 @@ TEST(BlockBasedTableTest, FilterBlockInBlockCache) { props.AssertEqual(2, 0, 0 + 1, // data block miss 0); } + iter.reset(); + + // -- PART 3: Open table with bloom filter enabled but not in SST file + table_options.block_cache = NewLRUCache(4096); + table_options.cache_index_and_filter_blocks = false; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + TableConstructor c3(BytewiseComparator()); + std::string user_key = "k01"; + InternalKey internal_key(user_key, 0, kTypeValue); + c3.Add(internal_key.Encode().ToString(), "hello"); + ImmutableCFOptions ioptions3(options); + // Generate table without filter policy + c3.Finish(options, ioptions3, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + // Open table with filter policy + table_options.filter_policy.reset(NewBloomFilterPolicy(1)); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.statistics = CreateDBStatistics(); + ImmutableCFOptions ioptions4(options); + ASSERT_OK(c3.Reopen(ioptions4)); + reader = dynamic_cast(c3.GetTableReader()); + ASSERT_TRUE(!reader->TEST_filter_block_preloaded()); + std::string value; + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, user_key, &value, + nullptr, nullptr); + ASSERT_OK(reader->Get(ReadOptions(), user_key, &get_context)); + ASSERT_EQ(value, "hello"); + BlockCachePropertiesSnapshot props(options.statistics.get()); + props.AssertFilterBlockStat(0, 0); } TEST(BlockBasedTableTest, BlockCacheLeak) { diff --git a/util/env_posix.cc b/util/env_posix.cc index cf917e874..76ba4a6bd 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -7,6 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#include #include #include #include @@ -1350,25 +1351,13 @@ class PosixEnv : public Env { } virtual uint64_t NowMicros() { - struct timeval tv; - // TODO(kailiu) MAC DON'T HAVE THIS - gettimeofday(&tv, nullptr); - return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; + return std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()).count(); } virtual uint64_t NowNanos() { -#ifdef OS_LINUX - struct timespec ts; - clock_gettime(CLOCK_MONOTONIC, &ts); - return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; -#elif __MACH__ - clock_serv_t cclock; - mach_timespec_t ts; - host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock); - clock_get_time(cclock, &ts); - mach_port_deallocate(mach_task_self(), cclock); -#endif - return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; + return std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()).count(); } virtual void SleepForMicroseconds(int micros) { diff --git a/util/mutable_cf_options.cc b/util/mutable_cf_options.cc index 1b3197b18..1bc8a5b7d 100644 --- a/util/mutable_cf_options.cc +++ b/util/mutable_cf_options.cc @@ -126,6 +126,10 @@ void MutableCFOptions::Dump(Logger* log) const { } result.resize(result.size() - 2); Log(log, "max_bytes_for_level_multiplier_additional: %s", result.c_str()); + Log(log, " max_mem_compaction_level: %d", + max_mem_compaction_level); + Log(log, " max_sequential_skip_in_iterations: %" PRIu64, + max_sequential_skip_in_iterations); } } // namespace rocksdb diff --git a/util/mutable_cf_options.h b/util/mutable_cf_options.h index eaecaa487..a738e7978 100644 --- a/util/mutable_cf_options.h +++ b/util/mutable_cf_options.h @@ -37,7 +37,10 @@ struct MutableCFOptions { max_bytes_for_level_base(options.max_bytes_for_level_base), max_bytes_for_level_multiplier(options.max_bytes_for_level_multiplier), max_bytes_for_level_multiplier_additional( - options.max_bytes_for_level_multiplier_additional) + options.max_bytes_for_level_multiplier_additional), + max_mem_compaction_level(options.max_mem_compaction_level), + max_sequential_skip_in_iterations( + options.max_sequential_skip_in_iterations) { RefreshDerivedOptions(ioptions); } @@ -62,7 +65,9 @@ struct MutableCFOptions { target_file_size_base(0), target_file_size_multiplier(0), max_bytes_for_level_base(0), - max_bytes_for_level_multiplier(0) + max_bytes_for_level_multiplier(0), + max_mem_compaction_level(0), + max_sequential_skip_in_iterations(0) {} // Must be called after any change to MutableCFOptions @@ -105,6 +110,10 @@ struct MutableCFOptions { uint64_t max_bytes_for_level_base; int max_bytes_for_level_multiplier; std::vector max_bytes_for_level_multiplier_additional; + int max_mem_compaction_level; + + // Misc options + uint64_t max_sequential_skip_in_iterations; // Derived options // Per-level target file size. diff --git a/util/options_helper.cc b/util/options_helper.cc index 2a56a1ccf..4fef52299 100644 --- a/util/options_helper.cc +++ b/util/options_helper.cc @@ -144,6 +144,19 @@ bool ParseCompactionOptions(const std::string& name, const std::string& value, start = end + 1; } } + } else if (name == "max_mem_compaction_level") { + new_options->max_mem_compaction_level = ParseInt(value); + } else { + return false; + } + return true; +} + +template +bool ParseMiscOptions(const std::string& name, const std::string& value, + OptionsType* new_options) { + if (name == "max_sequential_skip_in_iterations") { + new_options->max_sequential_skip_in_iterations = ParseUint64(value); } else { return false; } @@ -160,6 +173,7 @@ bool GetMutableOptionsFromStrings( for (const auto& o : options_map) { if (ParseMemtableOptions(o.first, o.second, new_options)) { } else if (ParseCompactionOptions(o.first, o.second, new_options)) { + } else if (ParseMiscOptions(o.first, o.second, new_options)) { } else { return false; } @@ -228,6 +242,7 @@ bool GetColumnFamilyOptionsFromMap( try { if (ParseMemtableOptions(o.first, o.second, new_options)) { } else if (ParseCompactionOptions(o.first, o.second, new_options)) { + } else if (ParseMiscOptions(o.first, o.second, new_options)) { } else if (o.first == "min_write_buffer_number_to_merge") { new_options->min_write_buffer_number_to_merge = ParseInt(o.second); } else if (o.first == "compression") { @@ -270,8 +285,6 @@ bool GetColumnFamilyOptionsFromMap( ParseInt(o.second.substr(start, o.second.size() - start)); } else if (o.first == "num_levels") { new_options->num_levels = ParseInt(o.second); - } else if (o.first == "max_mem_compaction_level") { - new_options->max_mem_compaction_level = ParseInt(o.second); } else if (o.first == "purge_redundant_kvs_while_flush") { new_options->purge_redundant_kvs_while_flush = ParseBoolean(o.first, o.second); @@ -286,8 +299,6 @@ bool GetColumnFamilyOptionsFromMap( } else if (o.first == "compaction_options_fifo") { new_options->compaction_options_fifo.max_table_files_size = ParseUint64(o.second); - } else if (o.first == "max_sequential_skip_in_iterations") { - new_options->max_sequential_skip_in_iterations = ParseUint64(o.second); } else if (o.first == "inplace_update_support") { new_options->inplace_update_support = ParseBoolean(o.first, o.second); } else if (o.first == "inplace_update_num_locks") { diff --git a/util/thread_local.h b/util/thread_local.h index a037a9ceb..6884ed138 100644 --- a/util/thread_local.h +++ b/util/thread_local.h @@ -26,13 +26,14 @@ namespace rocksdb { // (2) a ThreadLocalPtr is destroyed typedef void (*UnrefHandler)(void* ptr); -// Thread local storage that only stores value of pointer type. The storage -// distinguish data coming from different thread and different ThreadLocalPtr -// instances. For example, if a regular thread_local variable A is declared -// in DBImpl, two DBImpl objects would share the same A. ThreadLocalPtr avoids -// the confliction. The total storage size equals to # of threads * # of -// ThreadLocalPtr instances. It is not efficient in terms of space, but it -// should serve most of our use cases well and keep code simple. +// ThreadLocalPtr stores only values of pointer type. Different from +// the usual thread-local-storage, ThreadLocalPtr has the ability to +// distinguish data coming from different threads and different +// ThreadLocalPtr instances. For example, if a regular thread_local +// variable A is declared in DBImpl, two DBImpl objects would share +// the same A. However, a ThreadLocalPtr that is defined under the +// scope of DBImpl can avoid such confliction. As a result, its memory +// usage would be O(# of threads * # of ThreadLocalPtr instances). class ThreadLocalPtr { public: explicit ThreadLocalPtr(UnrefHandler handler = nullptr);