Allow allocating dynamic bloom, plain table indexes and hash linked list from huge page TLB
Summary: Add an option to allocate a piece of memory from huge page TLB. Add options to trigger it in dynamic bloom, plain table indexes and hash linked list hash table. Test Plan: make all check Reviewers: haobo, ljin Reviewed By: haobo CC: nkg-, dhruba, leveldb, igor, yhchiang Differential Revision: https://reviews.facebook.net/D18357 Conflicts: db/plain_table_db_test.cc util/options.cc
This commit is contained in:
parent
eb96dc003a
commit
808928fc99
@ -438,7 +438,7 @@ class DBTest {
|
|||||||
break;
|
break;
|
||||||
case kHashLinkList:
|
case kHashLinkList:
|
||||||
options.prefix_extractor.reset(NewFixedPrefixTransform(1));
|
options.prefix_extractor.reset(NewFixedPrefixTransform(1));
|
||||||
options.memtable_factory.reset(NewHashLinkListRepFactory(4));
|
options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0));
|
||||||
break;
|
break;
|
||||||
case kUniversalCompaction:
|
case kUniversalCompaction:
|
||||||
options.compaction_style = kCompactionStyleUniversal;
|
options.compaction_style = kCompactionStyleUniversal;
|
||||||
|
@ -52,9 +52,10 @@ MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options)
|
|||||||
// gone wrong already.
|
// gone wrong already.
|
||||||
assert(!should_flush_);
|
assert(!should_flush_);
|
||||||
if (prefix_extractor_ && options.memtable_prefix_bloom_bits > 0) {
|
if (prefix_extractor_ && options.memtable_prefix_bloom_bits > 0) {
|
||||||
prefix_bloom_.reset(new DynamicBloom(options.memtable_prefix_bloom_bits,
|
prefix_bloom_.reset(new DynamicBloom(
|
||||||
options.bloom_locality,
|
options.memtable_prefix_bloom_bits, options.bloom_locality,
|
||||||
options.memtable_prefix_bloom_probes));
|
options.memtable_prefix_bloom_probes, nullptr,
|
||||||
|
options.memtable_prefix_bloom_huge_page_tlb_size));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -188,7 +188,7 @@ class TestPlainTableReader : public PlainTableReader {
|
|||||||
const Options& options, bool* expect_bloom_not_match)
|
const Options& options, bool* expect_bloom_not_match)
|
||||||
: PlainTableReader(options, std::move(file), storage_options, icomparator,
|
: PlainTableReader(options, std::move(file), storage_options, icomparator,
|
||||||
file_size, bloom_bits_per_key, hash_table_ratio,
|
file_size, bloom_bits_per_key, hash_table_ratio,
|
||||||
index_sparseness, table_properties),
|
index_sparseness, table_properties, 2 * 1024 * 1024),
|
||||||
expect_bloom_not_match_(expect_bloom_not_match) {
|
expect_bloom_not_match_(expect_bloom_not_match) {
|
||||||
Status s = PopulateIndex(const_cast<TableProperties*>(table_properties));
|
Status s = PopulateIndex(const_cast<TableProperties*>(table_properties));
|
||||||
ASSERT_TRUE(s.ok());
|
ASSERT_TRUE(s.ok());
|
||||||
@ -209,13 +209,12 @@ extern const uint64_t kPlainTableMagicNumber;
|
|||||||
class TestPlainTableFactory : public PlainTableFactory {
|
class TestPlainTableFactory : public PlainTableFactory {
|
||||||
public:
|
public:
|
||||||
explicit TestPlainTableFactory(bool* expect_bloom_not_match,
|
explicit TestPlainTableFactory(bool* expect_bloom_not_match,
|
||||||
uint32_t user_key_len =
|
uint32_t user_key_len, int bloom_bits_per_key,
|
||||||
kPlainTableVariableLength,
|
double hash_table_ratio,
|
||||||
int bloom_bits_per_key = 0,
|
size_t index_sparseness,
|
||||||
double hash_table_ratio = 0.75,
|
size_t huge_page_tlb_size)
|
||||||
size_t index_sparseness = 16)
|
|
||||||
: PlainTableFactory(user_key_len, user_key_len, hash_table_ratio,
|
: PlainTableFactory(user_key_len, user_key_len, hash_table_ratio,
|
||||||
hash_table_ratio),
|
index_sparseness, huge_page_tlb_size),
|
||||||
bloom_bits_per_key_(bloom_bits_per_key),
|
bloom_bits_per_key_(bloom_bits_per_key),
|
||||||
hash_table_ratio_(hash_table_ratio),
|
hash_table_ratio_(hash_table_ratio),
|
||||||
index_sparseness_(index_sparseness),
|
index_sparseness_(index_sparseness),
|
||||||
@ -247,197 +246,208 @@ class TestPlainTableFactory : public PlainTableFactory {
|
|||||||
};
|
};
|
||||||
|
|
||||||
TEST(PlainTableDBTest, Flush) {
|
TEST(PlainTableDBTest, Flush) {
|
||||||
for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
|
for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
|
||||||
for (int total_order = 0; total_order <= 1; total_order++) {
|
huge_page_tlb_size += 2 * 1024 * 1024) {
|
||||||
Options options = CurrentOptions();
|
for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
|
||||||
options.create_if_missing = true;
|
for (int total_order = 0; total_order <= 1; total_order++) {
|
||||||
// Set only one bucket to force bucket conflict.
|
Options options = CurrentOptions();
|
||||||
// Test index interval for the same prefix to be 1, 2 and 4
|
options.create_if_missing = true;
|
||||||
if (total_order) {
|
// Set only one bucket to force bucket conflict.
|
||||||
options.table_factory.reset(
|
// Test index interval for the same prefix to be 1, 2 and 4
|
||||||
NewTotalOrderPlainTableFactory(16, bloom_bits, 2));
|
if (total_order) {
|
||||||
} else {
|
options.table_factory.reset(NewTotalOrderPlainTableFactory(
|
||||||
options.table_factory.reset(NewPlainTableFactory(16, bloom_bits));
|
16, bloom_bits, 2, huge_page_tlb_size));
|
||||||
|
} else {
|
||||||
|
options.table_factory.reset(NewPlainTableFactory(
|
||||||
|
16, bloom_bits, 0.75, 16, huge_page_tlb_size));
|
||||||
|
}
|
||||||
|
DestroyAndReopen(&options);
|
||||||
|
|
||||||
|
ASSERT_OK(Put("1000000000000foo", "v1"));
|
||||||
|
ASSERT_OK(Put("0000000000000bar", "v2"));
|
||||||
|
ASSERT_OK(Put("1000000000000foo", "v3"));
|
||||||
|
dbfull()->TEST_FlushMemTable();
|
||||||
|
|
||||||
|
TablePropertiesCollection ptc;
|
||||||
|
reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc);
|
||||||
|
ASSERT_EQ(1U, ptc.size());
|
||||||
|
auto row = ptc.begin();
|
||||||
|
auto tp = row->second;
|
||||||
|
ASSERT_EQ(total_order ? "4" : "12", (tp->user_collected_properties).at(
|
||||||
|
"plain_table_hash_table_size"));
|
||||||
|
ASSERT_EQ(total_order ? "9" : "0", (tp->user_collected_properties).at(
|
||||||
|
"plain_table_sub_index_size"));
|
||||||
|
|
||||||
|
ASSERT_EQ("v3", Get("1000000000000foo"));
|
||||||
|
ASSERT_EQ("v2", Get("0000000000000bar"));
|
||||||
}
|
}
|
||||||
DestroyAndReopen(&options);
|
|
||||||
|
|
||||||
ASSERT_OK(Put("1000000000000foo", "v1"));
|
|
||||||
ASSERT_OK(Put("0000000000000bar", "v2"));
|
|
||||||
ASSERT_OK(Put("1000000000000foo", "v3"));
|
|
||||||
dbfull()->TEST_FlushMemTable();
|
|
||||||
|
|
||||||
TablePropertiesCollection ptc;
|
|
||||||
reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc);
|
|
||||||
ASSERT_EQ(1U, ptc.size());
|
|
||||||
auto row = ptc.begin();
|
|
||||||
auto tp = row->second;
|
|
||||||
ASSERT_EQ(
|
|
||||||
total_order ? "4" : "12",
|
|
||||||
(tp->user_collected_properties).at("plain_table_hash_table_size"));
|
|
||||||
ASSERT_EQ(
|
|
||||||
total_order ? "9" : "0",
|
|
||||||
(tp->user_collected_properties).at("plain_table_sub_index_size"));
|
|
||||||
|
|
||||||
ASSERT_EQ("v3", Get("1000000000000foo"));
|
|
||||||
ASSERT_EQ("v2", Get("0000000000000bar"));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(PlainTableDBTest, Flush2) {
|
TEST(PlainTableDBTest, Flush2) {
|
||||||
for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
|
for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
|
||||||
for (int total_order = 0; total_order <= 1; total_order++) {
|
huge_page_tlb_size += 2 * 1024 * 1024) {
|
||||||
bool expect_bloom_not_match = false;
|
for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
|
||||||
Options options = CurrentOptions();
|
for (int total_order = 0; total_order <= 1; total_order++) {
|
||||||
options.create_if_missing = true;
|
bool expect_bloom_not_match = false;
|
||||||
// Set only one bucket to force bucket conflict.
|
Options options = CurrentOptions();
|
||||||
// Test index interval for the same prefix to be 1, 2 and 4
|
options.create_if_missing = true;
|
||||||
if (total_order) {
|
// Set only one bucket to force bucket conflict.
|
||||||
options.prefix_extractor = nullptr;
|
// Test index interval for the same prefix to be 1, 2 and 4
|
||||||
options.table_factory.reset(new TestPlainTableFactory(
|
|
||||||
&expect_bloom_not_match, 16, bloom_bits, 0, 2));
|
|
||||||
} else {
|
|
||||||
options.table_factory.reset(
|
|
||||||
new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits));
|
|
||||||
}
|
|
||||||
DestroyAndReopen(&options);
|
|
||||||
ASSERT_OK(Put("0000000000000bar", "b"));
|
|
||||||
ASSERT_OK(Put("1000000000000foo", "v1"));
|
|
||||||
dbfull()->TEST_FlushMemTable();
|
|
||||||
|
|
||||||
ASSERT_OK(Put("1000000000000foo", "v2"));
|
|
||||||
dbfull()->TEST_FlushMemTable();
|
|
||||||
ASSERT_EQ("v2", Get("1000000000000foo"));
|
|
||||||
|
|
||||||
ASSERT_OK(Put("0000000000000eee", "v3"));
|
|
||||||
dbfull()->TEST_FlushMemTable();
|
|
||||||
ASSERT_EQ("v3", Get("0000000000000eee"));
|
|
||||||
|
|
||||||
ASSERT_OK(Delete("0000000000000bar"));
|
|
||||||
dbfull()->TEST_FlushMemTable();
|
|
||||||
ASSERT_EQ("NOT_FOUND", Get("0000000000000bar"));
|
|
||||||
|
|
||||||
ASSERT_OK(Put("0000000000000eee", "v5"));
|
|
||||||
ASSERT_OK(Put("9000000000000eee", "v5"));
|
|
||||||
dbfull()->TEST_FlushMemTable();
|
|
||||||
ASSERT_EQ("v5", Get("0000000000000eee"));
|
|
||||||
|
|
||||||
// Test Bloom Filter
|
|
||||||
if (bloom_bits > 0) {
|
|
||||||
// Neither key nor value should exist.
|
|
||||||
expect_bloom_not_match = true;
|
|
||||||
ASSERT_EQ("NOT_FOUND", Get("5_not00000000bar"));
|
|
||||||
|
|
||||||
// Key doesn't exist any more but prefix exists.
|
|
||||||
if (total_order) {
|
if (total_order) {
|
||||||
ASSERT_EQ("NOT_FOUND", Get("1000000000000not"));
|
options.prefix_extractor = nullptr;
|
||||||
ASSERT_EQ("NOT_FOUND", Get("0000000000000not"));
|
options.table_factory.reset(
|
||||||
|
new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits,
|
||||||
|
0, 2, huge_page_tlb_size));
|
||||||
|
} else {
|
||||||
|
options.table_factory.reset(
|
||||||
|
new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits,
|
||||||
|
0.75, 16, huge_page_tlb_size));
|
||||||
|
}
|
||||||
|
DestroyAndReopen(&options);
|
||||||
|
ASSERT_OK(Put("0000000000000bar", "b"));
|
||||||
|
ASSERT_OK(Put("1000000000000foo", "v1"));
|
||||||
|
dbfull()->TEST_FlushMemTable();
|
||||||
|
|
||||||
|
ASSERT_OK(Put("1000000000000foo", "v2"));
|
||||||
|
dbfull()->TEST_FlushMemTable();
|
||||||
|
ASSERT_EQ("v2", Get("1000000000000foo"));
|
||||||
|
|
||||||
|
ASSERT_OK(Put("0000000000000eee", "v3"));
|
||||||
|
dbfull()->TEST_FlushMemTable();
|
||||||
|
ASSERT_EQ("v3", Get("0000000000000eee"));
|
||||||
|
|
||||||
|
ASSERT_OK(Delete("0000000000000bar"));
|
||||||
|
dbfull()->TEST_FlushMemTable();
|
||||||
|
ASSERT_EQ("NOT_FOUND", Get("0000000000000bar"));
|
||||||
|
|
||||||
|
ASSERT_OK(Put("0000000000000eee", "v5"));
|
||||||
|
ASSERT_OK(Put("9000000000000eee", "v5"));
|
||||||
|
dbfull()->TEST_FlushMemTable();
|
||||||
|
ASSERT_EQ("v5", Get("0000000000000eee"));
|
||||||
|
|
||||||
|
// Test Bloom Filter
|
||||||
|
if (bloom_bits > 0) {
|
||||||
|
// Neither key nor value should exist.
|
||||||
|
expect_bloom_not_match = true;
|
||||||
|
ASSERT_EQ("NOT_FOUND", Get("5_not00000000bar"));
|
||||||
|
|
||||||
|
// Key doesn't exist any more but prefix exists.
|
||||||
|
if (total_order) {
|
||||||
|
ASSERT_EQ("NOT_FOUND", Get("1000000000000not"));
|
||||||
|
ASSERT_EQ("NOT_FOUND", Get("0000000000000not"));
|
||||||
|
}
|
||||||
|
expect_bloom_not_match = false;
|
||||||
}
|
}
|
||||||
expect_bloom_not_match = false;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(PlainTableDBTest, Iterator) {
|
TEST(PlainTableDBTest, Iterator) {
|
||||||
for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
|
for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
|
||||||
for (int total_order = 0; total_order <= 1; total_order++) {
|
huge_page_tlb_size += 2 * 1024 * 1024) {
|
||||||
bool expect_bloom_not_match = false;
|
for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
|
||||||
Options options = CurrentOptions();
|
for (int total_order = 0; total_order <= 1; total_order++) {
|
||||||
options.create_if_missing = true;
|
bool expect_bloom_not_match = false;
|
||||||
// Set only one bucket to force bucket conflict.
|
Options options = CurrentOptions();
|
||||||
// Test index interval for the same prefix to be 1, 2 and 4
|
options.create_if_missing = true;
|
||||||
if (total_order) {
|
// Set only one bucket to force bucket conflict.
|
||||||
options.prefix_extractor = nullptr;
|
// Test index interval for the same prefix to be 1, 2 and 4
|
||||||
options.table_factory.reset(new TestPlainTableFactory(
|
if (total_order) {
|
||||||
&expect_bloom_not_match, 16, bloom_bits, 0, 2));
|
options.prefix_extractor = nullptr;
|
||||||
} else {
|
options.table_factory.reset(new TestPlainTableFactory(
|
||||||
options.table_factory.reset(
|
&expect_bloom_not_match, 16, bloom_bits, 0, 2, huge_page_tlb_size));
|
||||||
new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits));
|
} else {
|
||||||
}
|
options.table_factory.reset(
|
||||||
DestroyAndReopen(&options);
|
new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits,
|
||||||
|
0.75, 16, huge_page_tlb_size));
|
||||||
|
}
|
||||||
|
DestroyAndReopen(&options);
|
||||||
|
|
||||||
ASSERT_OK(Put("1000000000foo002", "v_2"));
|
ASSERT_OK(Put("1000000000foo002", "v_2"));
|
||||||
ASSERT_OK(Put("0000000000000bar", "random"));
|
ASSERT_OK(Put("0000000000000bar", "random"));
|
||||||
ASSERT_OK(Put("1000000000foo001", "v1"));
|
ASSERT_OK(Put("1000000000foo001", "v1"));
|
||||||
ASSERT_OK(Put("3000000000000bar", "bar_v"));
|
ASSERT_OK(Put("3000000000000bar", "bar_v"));
|
||||||
ASSERT_OK(Put("1000000000foo003", "v__3"));
|
ASSERT_OK(Put("1000000000foo003", "v__3"));
|
||||||
ASSERT_OK(Put("1000000000foo004", "v__4"));
|
ASSERT_OK(Put("1000000000foo004", "v__4"));
|
||||||
ASSERT_OK(Put("1000000000foo005", "v__5"));
|
ASSERT_OK(Put("1000000000foo005", "v__5"));
|
||||||
ASSERT_OK(Put("1000000000foo007", "v__7"));
|
ASSERT_OK(Put("1000000000foo007", "v__7"));
|
||||||
ASSERT_OK(Put("1000000000foo008", "v__8"));
|
ASSERT_OK(Put("1000000000foo008", "v__8"));
|
||||||
dbfull()->TEST_FlushMemTable();
|
dbfull()->TEST_FlushMemTable();
|
||||||
ASSERT_EQ("v1", Get("1000000000foo001"));
|
ASSERT_EQ("v1", Get("1000000000foo001"));
|
||||||
ASSERT_EQ("v__3", Get("1000000000foo003"));
|
ASSERT_EQ("v__3", Get("1000000000foo003"));
|
||||||
Iterator* iter = dbfull()->NewIterator(ro_);
|
Iterator* iter = dbfull()->NewIterator(ro_);
|
||||||
iter->Seek("1000000000foo000");
|
iter->Seek("1000000000foo000");
|
||||||
ASSERT_TRUE(iter->Valid());
|
ASSERT_TRUE(iter->Valid());
|
||||||
ASSERT_EQ("1000000000foo001", iter->key().ToString());
|
ASSERT_EQ("1000000000foo001", iter->key().ToString());
|
||||||
ASSERT_EQ("v1", iter->value().ToString());
|
ASSERT_EQ("v1", iter->value().ToString());
|
||||||
|
|
||||||
iter->Next();
|
iter->Next();
|
||||||
ASSERT_TRUE(iter->Valid());
|
ASSERT_TRUE(iter->Valid());
|
||||||
ASSERT_EQ("1000000000foo002", iter->key().ToString());
|
ASSERT_EQ("1000000000foo002", iter->key().ToString());
|
||||||
ASSERT_EQ("v_2", iter->value().ToString());
|
ASSERT_EQ("v_2", iter->value().ToString());
|
||||||
|
|
||||||
iter->Next();
|
iter->Next();
|
||||||
ASSERT_TRUE(iter->Valid());
|
ASSERT_TRUE(iter->Valid());
|
||||||
ASSERT_EQ("1000000000foo003", iter->key().ToString());
|
ASSERT_EQ("1000000000foo003", iter->key().ToString());
|
||||||
ASSERT_EQ("v__3", iter->value().ToString());
|
ASSERT_EQ("v__3", iter->value().ToString());
|
||||||
|
|
||||||
iter->Next();
|
iter->Next();
|
||||||
ASSERT_TRUE(iter->Valid());
|
ASSERT_TRUE(iter->Valid());
|
||||||
ASSERT_EQ("1000000000foo004", iter->key().ToString());
|
ASSERT_EQ("1000000000foo004", iter->key().ToString());
|
||||||
ASSERT_EQ("v__4", iter->value().ToString());
|
ASSERT_EQ("v__4", iter->value().ToString());
|
||||||
|
|
||||||
iter->Seek("3000000000000bar");
|
iter->Seek("3000000000000bar");
|
||||||
ASSERT_TRUE(iter->Valid());
|
|
||||||
ASSERT_EQ("3000000000000bar", iter->key().ToString());
|
|
||||||
ASSERT_EQ("bar_v", iter->value().ToString());
|
|
||||||
|
|
||||||
iter->Seek("1000000000foo000");
|
|
||||||
ASSERT_TRUE(iter->Valid());
|
|
||||||
ASSERT_EQ("1000000000foo001", iter->key().ToString());
|
|
||||||
ASSERT_EQ("v1", iter->value().ToString());
|
|
||||||
|
|
||||||
iter->Seek("1000000000foo005");
|
|
||||||
ASSERT_TRUE(iter->Valid());
|
|
||||||
ASSERT_EQ("1000000000foo005", iter->key().ToString());
|
|
||||||
ASSERT_EQ("v__5", iter->value().ToString());
|
|
||||||
|
|
||||||
iter->Seek("1000000000foo006");
|
|
||||||
ASSERT_TRUE(iter->Valid());
|
|
||||||
ASSERT_EQ("1000000000foo007", iter->key().ToString());
|
|
||||||
ASSERT_EQ("v__7", iter->value().ToString());
|
|
||||||
|
|
||||||
iter->Seek("1000000000foo008");
|
|
||||||
ASSERT_TRUE(iter->Valid());
|
|
||||||
ASSERT_EQ("1000000000foo008", iter->key().ToString());
|
|
||||||
ASSERT_EQ("v__8", iter->value().ToString());
|
|
||||||
|
|
||||||
if (total_order == 0) {
|
|
||||||
iter->Seek("1000000000foo009");
|
|
||||||
ASSERT_TRUE(iter->Valid());
|
ASSERT_TRUE(iter->Valid());
|
||||||
ASSERT_EQ("3000000000000bar", iter->key().ToString());
|
ASSERT_EQ("3000000000000bar", iter->key().ToString());
|
||||||
}
|
ASSERT_EQ("bar_v", iter->value().ToString());
|
||||||
|
|
||||||
// Test Bloom Filter
|
iter->Seek("1000000000foo000");
|
||||||
if (bloom_bits > 0) {
|
ASSERT_TRUE(iter->Valid());
|
||||||
if (!total_order) {
|
ASSERT_EQ("1000000000foo001", iter->key().ToString());
|
||||||
// Neither key nor value should exist.
|
ASSERT_EQ("v1", iter->value().ToString());
|
||||||
expect_bloom_not_match = true;
|
|
||||||
iter->Seek("2not000000000bar");
|
iter->Seek("1000000000foo005");
|
||||||
ASSERT_TRUE(!iter->Valid());
|
ASSERT_TRUE(iter->Valid());
|
||||||
ASSERT_EQ("NOT_FOUND", Get("2not000000000bar"));
|
ASSERT_EQ("1000000000foo005", iter->key().ToString());
|
||||||
expect_bloom_not_match = false;
|
ASSERT_EQ("v__5", iter->value().ToString());
|
||||||
} else {
|
|
||||||
expect_bloom_not_match = true;
|
iter->Seek("1000000000foo006");
|
||||||
ASSERT_EQ("NOT_FOUND", Get("2not000000000bar"));
|
ASSERT_TRUE(iter->Valid());
|
||||||
expect_bloom_not_match = false;
|
ASSERT_EQ("1000000000foo007", iter->key().ToString());
|
||||||
|
ASSERT_EQ("v__7", iter->value().ToString());
|
||||||
|
|
||||||
|
iter->Seek("1000000000foo008");
|
||||||
|
ASSERT_TRUE(iter->Valid());
|
||||||
|
ASSERT_EQ("1000000000foo008", iter->key().ToString());
|
||||||
|
ASSERT_EQ("v__8", iter->value().ToString());
|
||||||
|
|
||||||
|
if (total_order == 0) {
|
||||||
|
iter->Seek("1000000000foo009");
|
||||||
|
ASSERT_TRUE(iter->Valid());
|
||||||
|
ASSERT_EQ("3000000000000bar", iter->key().ToString());
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
delete iter;
|
// Test Bloom Filter
|
||||||
|
if (bloom_bits > 0) {
|
||||||
|
if (!total_order) {
|
||||||
|
// Neither key nor value should exist.
|
||||||
|
expect_bloom_not_match = true;
|
||||||
|
iter->Seek("2not000000000bar");
|
||||||
|
ASSERT_TRUE(!iter->Valid());
|
||||||
|
ASSERT_EQ("NOT_FOUND", Get("2not000000000bar"));
|
||||||
|
expect_bloom_not_match = false;
|
||||||
|
} else {
|
||||||
|
expect_bloom_not_match = true;
|
||||||
|
ASSERT_EQ("NOT_FOUND", Get("2not000000000bar"));
|
||||||
|
expect_bloom_not_match = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
delete iter;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -582,165 +592,173 @@ TEST(PlainTableDBTest, IteratorReverseSuffixComparator) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
TEST(PlainTableDBTest, HashBucketConflict) {
|
TEST(PlainTableDBTest, HashBucketConflict) {
|
||||||
for (unsigned char i = 1; i <= 3; i++) {
|
for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
|
||||||
Options options = CurrentOptions();
|
huge_page_tlb_size += 2 * 1024 * 1024) {
|
||||||
options.create_if_missing = true;
|
for (unsigned char i = 1; i <= 3; i++) {
|
||||||
// Set only one bucket to force bucket conflict.
|
Options options = CurrentOptions();
|
||||||
// Test index interval for the same prefix to be 1, 2 and 4
|
options.create_if_missing = true;
|
||||||
options.table_factory.reset(NewTotalOrderPlainTableFactory(16, 0, 2 ^ i));
|
// Set only one bucket to force bucket conflict.
|
||||||
DestroyAndReopen(&options);
|
// Test index interval for the same prefix to be 1, 2 and 4
|
||||||
ASSERT_OK(Put("5000000000000fo0", "v1"));
|
options.table_factory.reset(
|
||||||
ASSERT_OK(Put("5000000000000fo1", "v2"));
|
NewTotalOrderPlainTableFactory(16, 0, 2 ^ i, huge_page_tlb_size));
|
||||||
ASSERT_OK(Put("5000000000000fo2", "v"));
|
DestroyAndReopen(&options);
|
||||||
ASSERT_OK(Put("2000000000000fo0", "v3"));
|
ASSERT_OK(Put("5000000000000fo0", "v1"));
|
||||||
ASSERT_OK(Put("2000000000000fo1", "v4"));
|
ASSERT_OK(Put("5000000000000fo1", "v2"));
|
||||||
ASSERT_OK(Put("2000000000000fo2", "v"));
|
ASSERT_OK(Put("5000000000000fo2", "v"));
|
||||||
ASSERT_OK(Put("2000000000000fo3", "v"));
|
ASSERT_OK(Put("2000000000000fo0", "v3"));
|
||||||
|
ASSERT_OK(Put("2000000000000fo1", "v4"));
|
||||||
|
ASSERT_OK(Put("2000000000000fo2", "v"));
|
||||||
|
ASSERT_OK(Put("2000000000000fo3", "v"));
|
||||||
|
|
||||||
dbfull()->TEST_FlushMemTable();
|
dbfull()->TEST_FlushMemTable();
|
||||||
|
|
||||||
ASSERT_EQ("v1", Get("5000000000000fo0"));
|
ASSERT_EQ("v1", Get("5000000000000fo0"));
|
||||||
ASSERT_EQ("v2", Get("5000000000000fo1"));
|
ASSERT_EQ("v2", Get("5000000000000fo1"));
|
||||||
ASSERT_EQ("v3", Get("2000000000000fo0"));
|
ASSERT_EQ("v3", Get("2000000000000fo0"));
|
||||||
ASSERT_EQ("v4", Get("2000000000000fo1"));
|
ASSERT_EQ("v4", Get("2000000000000fo1"));
|
||||||
|
|
||||||
ASSERT_EQ("NOT_FOUND", Get("5000000000000bar"));
|
ASSERT_EQ("NOT_FOUND", Get("5000000000000bar"));
|
||||||
ASSERT_EQ("NOT_FOUND", Get("2000000000000bar"));
|
ASSERT_EQ("NOT_FOUND", Get("2000000000000bar"));
|
||||||
ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8"));
|
ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8"));
|
||||||
ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8"));
|
ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8"));
|
||||||
|
|
||||||
ReadOptions ro;
|
ReadOptions ro;
|
||||||
Iterator* iter = dbfull()->NewIterator(ro);
|
Iterator* iter = dbfull()->NewIterator(ro);
|
||||||
|
|
||||||
iter->Seek("5000000000000fo0");
|
iter->Seek("5000000000000fo0");
|
||||||
ASSERT_TRUE(iter->Valid());
|
ASSERT_TRUE(iter->Valid());
|
||||||
ASSERT_EQ("5000000000000fo0", iter->key().ToString());
|
ASSERT_EQ("5000000000000fo0", iter->key().ToString());
|
||||||
iter->Next();
|
iter->Next();
|
||||||
ASSERT_TRUE(iter->Valid());
|
ASSERT_TRUE(iter->Valid());
|
||||||
ASSERT_EQ("5000000000000fo1", iter->key().ToString());
|
ASSERT_EQ("5000000000000fo1", iter->key().ToString());
|
||||||
|
|
||||||
iter->Seek("5000000000000fo1");
|
iter->Seek("5000000000000fo1");
|
||||||
ASSERT_TRUE(iter->Valid());
|
ASSERT_TRUE(iter->Valid());
|
||||||
ASSERT_EQ("5000000000000fo1", iter->key().ToString());
|
ASSERT_EQ("5000000000000fo1", iter->key().ToString());
|
||||||
|
|
||||||
iter->Seek("2000000000000fo0");
|
iter->Seek("2000000000000fo0");
|
||||||
ASSERT_TRUE(iter->Valid());
|
ASSERT_TRUE(iter->Valid());
|
||||||
ASSERT_EQ("2000000000000fo0", iter->key().ToString());
|
ASSERT_EQ("2000000000000fo0", iter->key().ToString());
|
||||||
iter->Next();
|
iter->Next();
|
||||||
ASSERT_TRUE(iter->Valid());
|
ASSERT_TRUE(iter->Valid());
|
||||||
ASSERT_EQ("2000000000000fo1", iter->key().ToString());
|
ASSERT_EQ("2000000000000fo1", iter->key().ToString());
|
||||||
|
|
||||||
iter->Seek("2000000000000fo1");
|
iter->Seek("2000000000000fo1");
|
||||||
ASSERT_TRUE(iter->Valid());
|
ASSERT_TRUE(iter->Valid());
|
||||||
ASSERT_EQ("2000000000000fo1", iter->key().ToString());
|
ASSERT_EQ("2000000000000fo1", iter->key().ToString());
|
||||||
|
|
||||||
iter->Seek("2000000000000bar");
|
iter->Seek("2000000000000bar");
|
||||||
ASSERT_TRUE(iter->Valid());
|
ASSERT_TRUE(iter->Valid());
|
||||||
ASSERT_EQ("2000000000000fo0", iter->key().ToString());
|
ASSERT_EQ("2000000000000fo0", iter->key().ToString());
|
||||||
|
|
||||||
iter->Seek("5000000000000bar");
|
iter->Seek("5000000000000bar");
|
||||||
ASSERT_TRUE(iter->Valid());
|
ASSERT_TRUE(iter->Valid());
|
||||||
ASSERT_EQ("5000000000000fo0", iter->key().ToString());
|
ASSERT_EQ("5000000000000fo0", iter->key().ToString());
|
||||||
|
|
||||||
iter->Seek("2000000000000fo8");
|
iter->Seek("2000000000000fo8");
|
||||||
ASSERT_TRUE(!iter->Valid() ||
|
ASSERT_TRUE(!iter->Valid() ||
|
||||||
options.comparator->Compare(iter->key(), "20000001") > 0);
|
options.comparator->Compare(iter->key(), "20000001") > 0);
|
||||||
|
|
||||||
iter->Seek("5000000000000fo8");
|
iter->Seek("5000000000000fo8");
|
||||||
ASSERT_TRUE(!iter->Valid());
|
ASSERT_TRUE(!iter->Valid());
|
||||||
|
|
||||||
iter->Seek("1000000000000fo2");
|
iter->Seek("1000000000000fo2");
|
||||||
ASSERT_TRUE(!iter->Valid());
|
ASSERT_TRUE(!iter->Valid());
|
||||||
|
|
||||||
iter->Seek("3000000000000fo2");
|
iter->Seek("3000000000000fo2");
|
||||||
ASSERT_TRUE(!iter->Valid());
|
ASSERT_TRUE(!iter->Valid());
|
||||||
|
|
||||||
iter->Seek("8000000000000fo2");
|
iter->Seek("8000000000000fo2");
|
||||||
ASSERT_TRUE(!iter->Valid());
|
ASSERT_TRUE(!iter->Valid());
|
||||||
|
|
||||||
delete iter;
|
delete iter;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(PlainTableDBTest, HashBucketConflictReverseSuffixComparator) {
|
TEST(PlainTableDBTest, HashBucketConflictReverseSuffixComparator) {
|
||||||
for (unsigned char i = 1; i <= 3; i++) {
|
for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
|
||||||
Options options = CurrentOptions();
|
huge_page_tlb_size += 2 * 1024 * 1024) {
|
||||||
options.create_if_missing = true;
|
for (unsigned char i = 1; i <= 3; i++) {
|
||||||
SimpleSuffixReverseComparator comp;
|
Options options = CurrentOptions();
|
||||||
options.comparator = &comp;
|
options.create_if_missing = true;
|
||||||
// Set only one bucket to force bucket conflict.
|
SimpleSuffixReverseComparator comp;
|
||||||
// Test index interval for the same prefix to be 1, 2 and 4
|
options.comparator = &comp;
|
||||||
options.table_factory.reset(NewTotalOrderPlainTableFactory(16, 0, 2 ^ i));
|
// Set only one bucket to force bucket conflict.
|
||||||
DestroyAndReopen(&options);
|
// Test index interval for the same prefix to be 1, 2 and 4
|
||||||
ASSERT_OK(Put("5000000000000fo0", "v1"));
|
options.table_factory.reset(
|
||||||
ASSERT_OK(Put("5000000000000fo1", "v2"));
|
NewTotalOrderPlainTableFactory(16, 0, 2 ^ i, huge_page_tlb_size));
|
||||||
ASSERT_OK(Put("5000000000000fo2", "v"));
|
DestroyAndReopen(&options);
|
||||||
ASSERT_OK(Put("2000000000000fo0", "v3"));
|
ASSERT_OK(Put("5000000000000fo0", "v1"));
|
||||||
ASSERT_OK(Put("2000000000000fo1", "v4"));
|
ASSERT_OK(Put("5000000000000fo1", "v2"));
|
||||||
ASSERT_OK(Put("2000000000000fo2", "v"));
|
ASSERT_OK(Put("5000000000000fo2", "v"));
|
||||||
ASSERT_OK(Put("2000000000000fo3", "v"));
|
ASSERT_OK(Put("2000000000000fo0", "v3"));
|
||||||
|
ASSERT_OK(Put("2000000000000fo1", "v4"));
|
||||||
|
ASSERT_OK(Put("2000000000000fo2", "v"));
|
||||||
|
ASSERT_OK(Put("2000000000000fo3", "v"));
|
||||||
|
|
||||||
dbfull()->TEST_FlushMemTable();
|
dbfull()->TEST_FlushMemTable();
|
||||||
|
|
||||||
ASSERT_EQ("v1", Get("5000000000000fo0"));
|
ASSERT_EQ("v1", Get("5000000000000fo0"));
|
||||||
ASSERT_EQ("v2", Get("5000000000000fo1"));
|
ASSERT_EQ("v2", Get("5000000000000fo1"));
|
||||||
ASSERT_EQ("v3", Get("2000000000000fo0"));
|
ASSERT_EQ("v3", Get("2000000000000fo0"));
|
||||||
ASSERT_EQ("v4", Get("2000000000000fo1"));
|
ASSERT_EQ("v4", Get("2000000000000fo1"));
|
||||||
|
|
||||||
ASSERT_EQ("NOT_FOUND", Get("5000000000000bar"));
|
ASSERT_EQ("NOT_FOUND", Get("5000000000000bar"));
|
||||||
ASSERT_EQ("NOT_FOUND", Get("2000000000000bar"));
|
ASSERT_EQ("NOT_FOUND", Get("2000000000000bar"));
|
||||||
ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8"));
|
ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8"));
|
||||||
ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8"));
|
ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8"));
|
||||||
|
|
||||||
ReadOptions ro;
|
ReadOptions ro;
|
||||||
Iterator* iter = dbfull()->NewIterator(ro);
|
Iterator* iter = dbfull()->NewIterator(ro);
|
||||||
|
|
||||||
iter->Seek("5000000000000fo1");
|
iter->Seek("5000000000000fo1");
|
||||||
ASSERT_TRUE(iter->Valid());
|
ASSERT_TRUE(iter->Valid());
|
||||||
ASSERT_EQ("5000000000000fo1", iter->key().ToString());
|
ASSERT_EQ("5000000000000fo1", iter->key().ToString());
|
||||||
iter->Next();
|
iter->Next();
|
||||||
ASSERT_TRUE(iter->Valid());
|
ASSERT_TRUE(iter->Valid());
|
||||||
ASSERT_EQ("5000000000000fo0", iter->key().ToString());
|
ASSERT_EQ("5000000000000fo0", iter->key().ToString());
|
||||||
|
|
||||||
iter->Seek("5000000000000fo1");
|
iter->Seek("5000000000000fo1");
|
||||||
ASSERT_TRUE(iter->Valid());
|
ASSERT_TRUE(iter->Valid());
|
||||||
ASSERT_EQ("5000000000000fo1", iter->key().ToString());
|
ASSERT_EQ("5000000000000fo1", iter->key().ToString());
|
||||||
|
|
||||||
iter->Seek("2000000000000fo1");
|
iter->Seek("2000000000000fo1");
|
||||||
ASSERT_TRUE(iter->Valid());
|
ASSERT_TRUE(iter->Valid());
|
||||||
ASSERT_EQ("2000000000000fo1", iter->key().ToString());
|
ASSERT_EQ("2000000000000fo1", iter->key().ToString());
|
||||||
iter->Next();
|
iter->Next();
|
||||||
ASSERT_TRUE(iter->Valid());
|
ASSERT_TRUE(iter->Valid());
|
||||||
ASSERT_EQ("2000000000000fo0", iter->key().ToString());
|
ASSERT_EQ("2000000000000fo0", iter->key().ToString());
|
||||||
|
|
||||||
iter->Seek("2000000000000fo1");
|
iter->Seek("2000000000000fo1");
|
||||||
ASSERT_TRUE(iter->Valid());
|
ASSERT_TRUE(iter->Valid());
|
||||||
ASSERT_EQ("2000000000000fo1", iter->key().ToString());
|
ASSERT_EQ("2000000000000fo1", iter->key().ToString());
|
||||||
|
|
||||||
iter->Seek("2000000000000var");
|
iter->Seek("2000000000000var");
|
||||||
ASSERT_TRUE(iter->Valid());
|
ASSERT_TRUE(iter->Valid());
|
||||||
ASSERT_EQ("2000000000000fo3", iter->key().ToString());
|
ASSERT_EQ("2000000000000fo3", iter->key().ToString());
|
||||||
|
|
||||||
iter->Seek("5000000000000var");
|
iter->Seek("5000000000000var");
|
||||||
ASSERT_TRUE(iter->Valid());
|
ASSERT_TRUE(iter->Valid());
|
||||||
ASSERT_EQ("5000000000000fo2", iter->key().ToString());
|
ASSERT_EQ("5000000000000fo2", iter->key().ToString());
|
||||||
|
|
||||||
std::string seek_key = "2000000000000bar";
|
std::string seek_key = "2000000000000bar";
|
||||||
iter->Seek(seek_key);
|
iter->Seek(seek_key);
|
||||||
ASSERT_TRUE(!iter->Valid() ||
|
ASSERT_TRUE(!iter->Valid() ||
|
||||||
options.prefix_extractor->Transform(iter->key()) !=
|
options.prefix_extractor->Transform(iter->key()) !=
|
||||||
options.prefix_extractor->Transform(seek_key));
|
options.prefix_extractor->Transform(seek_key));
|
||||||
|
|
||||||
iter->Seek("1000000000000fo2");
|
iter->Seek("1000000000000fo2");
|
||||||
ASSERT_TRUE(!iter->Valid());
|
ASSERT_TRUE(!iter->Valid());
|
||||||
|
|
||||||
iter->Seek("3000000000000fo2");
|
iter->Seek("3000000000000fo2");
|
||||||
ASSERT_TRUE(!iter->Valid());
|
ASSERT_TRUE(!iter->Valid());
|
||||||
|
|
||||||
iter->Seek("8000000000000fo2");
|
iter->Seek("8000000000000fo2");
|
||||||
ASSERT_TRUE(!iter->Valid());
|
ASSERT_TRUE(!iter->Valid());
|
||||||
|
|
||||||
delete iter;
|
delete iter;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -31,6 +31,7 @@ DEFINE_int64(min_write_buffer_number_to_merge, 1, "");
|
|||||||
DEFINE_int32(skiplist_height, 4, "");
|
DEFINE_int32(skiplist_height, 4, "");
|
||||||
DEFINE_int32(memtable_prefix_bloom_bits, 10000000, "");
|
DEFINE_int32(memtable_prefix_bloom_bits, 10000000, "");
|
||||||
DEFINE_int32(memtable_prefix_bloom_probes, 10, "");
|
DEFINE_int32(memtable_prefix_bloom_probes, 10, "");
|
||||||
|
DEFINE_int32(memtable_prefix_bloom_huge_page_tlb_size, 2 * 1024 * 1024, "");
|
||||||
DEFINE_int32(value_size, 40, "");
|
DEFINE_int32(value_size, 40, "");
|
||||||
|
|
||||||
// Path to the database on file system
|
// Path to the database on file system
|
||||||
@ -147,6 +148,8 @@ class PrefixTest {
|
|||||||
|
|
||||||
options.memtable_prefix_bloom_bits = FLAGS_memtable_prefix_bloom_bits;
|
options.memtable_prefix_bloom_bits = FLAGS_memtable_prefix_bloom_bits;
|
||||||
options.memtable_prefix_bloom_probes = FLAGS_memtable_prefix_bloom_probes;
|
options.memtable_prefix_bloom_probes = FLAGS_memtable_prefix_bloom_probes;
|
||||||
|
options.memtable_prefix_bloom_huge_page_tlb_size =
|
||||||
|
FLAGS_memtable_prefix_bloom_huge_page_tlb_size;
|
||||||
|
|
||||||
Status s = DB::Open(options, kDbName, &db);
|
Status s = DB::Open(options, kDbName, &db);
|
||||||
ASSERT_OK(s);
|
ASSERT_OK(s);
|
||||||
@ -171,6 +174,10 @@ class PrefixTest {
|
|||||||
options.memtable_factory.reset(
|
options.memtable_factory.reset(
|
||||||
NewHashLinkListRepFactory(bucket_count));
|
NewHashLinkListRepFactory(bucket_count));
|
||||||
return true;
|
return true;
|
||||||
|
case kHashLinkListHugePageTlb:
|
||||||
|
options.memtable_factory.reset(
|
||||||
|
NewHashLinkListRepFactory(bucket_count, 2 * 1024 * 1024));
|
||||||
|
return true;
|
||||||
default:
|
default:
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -189,6 +196,7 @@ class PrefixTest {
|
|||||||
kBegin,
|
kBegin,
|
||||||
kHashSkipList,
|
kHashSkipList,
|
||||||
kHashLinkList,
|
kHashLinkList,
|
||||||
|
kHashLinkListHugePageTlb,
|
||||||
kEnd
|
kEnd
|
||||||
};
|
};
|
||||||
int option_config_;
|
int option_config_;
|
||||||
|
@ -223,8 +223,13 @@ extern MemTableRepFactory* NewHashSkipListRepFactory(
|
|||||||
// The factory is to create memtables with a hashed linked list:
|
// The factory is to create memtables with a hashed linked list:
|
||||||
// it contains a fixed array of buckets, each pointing to a sorted single
|
// it contains a fixed array of buckets, each pointing to a sorted single
|
||||||
// linked list (null if the bucket is empty).
|
// linked list (null if the bucket is empty).
|
||||||
// bucket_count: number of fixed array buckets
|
// @bucket_count: number of fixed array buckets
|
||||||
|
// @huge_page_tlb_size: if <=0, allocate the hash table bytes from malloc.
|
||||||
|
// Otherwise from huge page TLB. The user needs to reserve
|
||||||
|
// huge pages for it to be allocated, like:
|
||||||
|
// sysctl -w vm.nr_hugepages=20
|
||||||
|
// See linux doc Documentation/vm/hugetlbpage.txt
|
||||||
extern MemTableRepFactory* NewHashLinkListRepFactory(
|
extern MemTableRepFactory* NewHashLinkListRepFactory(
|
||||||
size_t bucket_count = 50000);
|
size_t bucket_count = 50000, size_t huge_page_tlb_size = 2 * 1024 * 1024);
|
||||||
|
|
||||||
} // namespace rocksdb
|
} // namespace rocksdb
|
||||||
|
@ -719,6 +719,14 @@ struct Options {
|
|||||||
// number of hash probes per key
|
// number of hash probes per key
|
||||||
uint32_t memtable_prefix_bloom_probes;
|
uint32_t memtable_prefix_bloom_probes;
|
||||||
|
|
||||||
|
// Page size for huge page TLB for bloom in memtable. If <=0, do not allocate
|
||||||
|
// from huge page TLB but from malloc.
|
||||||
|
// Need to reserve huge pages for it to be allocated. For example:
|
||||||
|
// sysctl -w vm.nr_hugepages=20
|
||||||
|
// See linux doc Documentation/vm/hugetlbpage.txt
|
||||||
|
|
||||||
|
size_t memtable_prefix_bloom_huge_page_tlb_size;
|
||||||
|
|
||||||
// Control locality of bloom filter probes to improve cache miss rate.
|
// Control locality of bloom filter probes to improve cache miss rate.
|
||||||
// This option only applies to memtable prefix bloom and plaintable
|
// This option only applies to memtable prefix bloom and plaintable
|
||||||
// prefix bloom. It essentially limits the max number of cache lines each
|
// prefix bloom. It essentially limits the max number of cache lines each
|
||||||
|
@ -97,12 +97,19 @@ extern TableFactory* NewBlockBasedTableFactory(
|
|||||||
// in the hash table
|
// in the hash table
|
||||||
// @index_sparseness: inside each prefix, need to build one index record for how
|
// @index_sparseness: inside each prefix, need to build one index record for how
|
||||||
// many keys for binary search inside each hash bucket.
|
// many keys for binary search inside each hash bucket.
|
||||||
|
// @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc.
|
||||||
|
// Otherwise from huge page TLB. The user needs to reserve
|
||||||
|
// huge pages for it to be allocated, like:
|
||||||
|
// sysctl -w vm.nr_hugepages=20
|
||||||
|
// See linux doc Documentation/vm/hugetlbpage.txt
|
||||||
|
|
||||||
const uint32_t kPlainTableVariableLength = 0;
|
const uint32_t kPlainTableVariableLength = 0;
|
||||||
extern TableFactory* NewPlainTableFactory(uint32_t user_key_len =
|
extern TableFactory* NewPlainTableFactory(uint32_t user_key_len =
|
||||||
kPlainTableVariableLength,
|
kPlainTableVariableLength,
|
||||||
int bloom_bits_per_prefix = 10,
|
int bloom_bits_per_prefix = 10,
|
||||||
double hash_table_ratio = 0.75,
|
double hash_table_ratio = 0.75,
|
||||||
size_t index_sparseness = 16);
|
size_t index_sparseness = 16,
|
||||||
|
size_t huge_page_tlb_size = 0);
|
||||||
|
|
||||||
// -- Plain Table
|
// -- Plain Table
|
||||||
// This factory of plain table ignores Options.prefix_extractor and assumes no
|
// This factory of plain table ignores Options.prefix_extractor and assumes no
|
||||||
@ -116,9 +123,15 @@ extern TableFactory* NewPlainTableFactory(uint32_t user_key_len =
|
|||||||
// disable it by passing a zero.
|
// disable it by passing a zero.
|
||||||
// @index_sparseness: need to build one index record for how many keys for
|
// @index_sparseness: need to build one index record for how many keys for
|
||||||
// binary search.
|
// binary search.
|
||||||
|
// @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc.
|
||||||
|
// Otherwise from huge page TLB. The user needs to reserve
|
||||||
|
// huge pages for it to be allocated, like:
|
||||||
|
// sysctl -w vm.nr_hugepages=20
|
||||||
|
// See linux doc Documentation/vm/hugetlbpage.txt
|
||||||
extern TableFactory* NewTotalOrderPlainTableFactory(
|
extern TableFactory* NewTotalOrderPlainTableFactory(
|
||||||
uint32_t user_key_len = kPlainTableVariableLength,
|
uint32_t user_key_len = kPlainTableVariableLength,
|
||||||
int bloom_bits_per_key = 0, size_t index_sparseness = 16);
|
int bloom_bits_per_key = 0, size_t index_sparseness = 16,
|
||||||
|
size_t huge_page_tlb_size = 0);
|
||||||
|
|
||||||
// A base class for table factories.
|
// A base class for table factories.
|
||||||
class TableFactory {
|
class TableFactory {
|
||||||
|
@ -21,7 +21,8 @@ Status PlainTableFactory::NewTableReader(const Options& options,
|
|||||||
unique_ptr<TableReader>* table) const {
|
unique_ptr<TableReader>* table) const {
|
||||||
return PlainTableReader::Open(options, soptions, icomp, std::move(file),
|
return PlainTableReader::Open(options, soptions, icomp, std::move(file),
|
||||||
file_size, table, bloom_bits_per_key_,
|
file_size, table, bloom_bits_per_key_,
|
||||||
hash_table_ratio_, index_sparseness_);
|
hash_table_ratio_, index_sparseness_,
|
||||||
|
huge_page_tlb_size_);
|
||||||
}
|
}
|
||||||
|
|
||||||
TableBuilder* PlainTableFactory::NewTableBuilder(
|
TableBuilder* PlainTableFactory::NewTableBuilder(
|
||||||
@ -33,16 +34,19 @@ TableBuilder* PlainTableFactory::NewTableBuilder(
|
|||||||
extern TableFactory* NewPlainTableFactory(uint32_t user_key_len,
|
extern TableFactory* NewPlainTableFactory(uint32_t user_key_len,
|
||||||
int bloom_bits_per_key,
|
int bloom_bits_per_key,
|
||||||
double hash_table_ratio,
|
double hash_table_ratio,
|
||||||
size_t index_sparseness) {
|
size_t index_sparseness,
|
||||||
|
size_t huge_page_tlb_size) {
|
||||||
return new PlainTableFactory(user_key_len, bloom_bits_per_key,
|
return new PlainTableFactory(user_key_len, bloom_bits_per_key,
|
||||||
hash_table_ratio, index_sparseness);
|
hash_table_ratio, index_sparseness,
|
||||||
|
huge_page_tlb_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
extern TableFactory* NewTotalOrderPlainTableFactory(uint32_t user_key_len,
|
extern TableFactory* NewTotalOrderPlainTableFactory(uint32_t user_key_len,
|
||||||
int bloom_bits_per_key,
|
int bloom_bits_per_key,
|
||||||
size_t index_sparseness) {
|
size_t index_sparseness,
|
||||||
|
size_t huge_page_tlb_size) {
|
||||||
return new PlainTableFactory(user_key_len, bloom_bits_per_key, 0,
|
return new PlainTableFactory(user_key_len, bloom_bits_per_key, 0,
|
||||||
index_sparseness);
|
index_sparseness, huge_page_tlb_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace rocksdb
|
} // namespace rocksdb
|
||||||
|
@ -54,14 +54,19 @@ class PlainTableFactory : public TableFactory {
|
|||||||
// inside the same prefix. It will be the maximum number of linear search
|
// inside the same prefix. It will be the maximum number of linear search
|
||||||
// required after hash and binary search.
|
// required after hash and binary search.
|
||||||
// index_sparseness = 0 means index for every key.
|
// index_sparseness = 0 means index for every key.
|
||||||
|
// huge_page_tlb_size determines whether to allocate hash indexes from huge
|
||||||
|
// page TLB and the page size if allocating from there. See comments of
|
||||||
|
// Arena::AllocateAligned() for details.
|
||||||
explicit PlainTableFactory(uint32_t user_key_len = kPlainTableVariableLength,
|
explicit PlainTableFactory(uint32_t user_key_len = kPlainTableVariableLength,
|
||||||
int bloom_bits_per_key = 0,
|
int bloom_bits_per_key = 0,
|
||||||
double hash_table_ratio = 0.75,
|
double hash_table_ratio = 0.75,
|
||||||
size_t index_sparseness = 16)
|
size_t index_sparseness = 16,
|
||||||
|
size_t huge_page_tlb_size = 2 * 1024 * 1024)
|
||||||
: user_key_len_(user_key_len),
|
: user_key_len_(user_key_len),
|
||||||
bloom_bits_per_key_(bloom_bits_per_key),
|
bloom_bits_per_key_(bloom_bits_per_key),
|
||||||
hash_table_ratio_(hash_table_ratio),
|
hash_table_ratio_(hash_table_ratio),
|
||||||
index_sparseness_(index_sparseness) {}
|
index_sparseness_(index_sparseness),
|
||||||
|
huge_page_tlb_size_(huge_page_tlb_size) {}
|
||||||
const char* Name() const override { return "PlainTable"; }
|
const char* Name() const override { return "PlainTable"; }
|
||||||
Status NewTableReader(const Options& options, const EnvOptions& soptions,
|
Status NewTableReader(const Options& options, const EnvOptions& soptions,
|
||||||
const InternalKeyComparator& internal_comparator,
|
const InternalKeyComparator& internal_comparator,
|
||||||
@ -80,6 +85,7 @@ class PlainTableFactory : public TableFactory {
|
|||||||
int bloom_bits_per_key_;
|
int bloom_bits_per_key_;
|
||||||
double hash_table_ratio_;
|
double hash_table_ratio_;
|
||||||
size_t index_sparseness_;
|
size_t index_sparseness_;
|
||||||
|
size_t huge_page_tlb_size_;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace rocksdb
|
} // namespace rocksdb
|
||||||
|
@ -23,6 +23,7 @@
|
|||||||
#include "table/two_level_iterator.h"
|
#include "table/two_level_iterator.h"
|
||||||
#include "table/plain_table_factory.h"
|
#include "table/plain_table_factory.h"
|
||||||
|
|
||||||
|
#include "util/arena.h"
|
||||||
#include "util/coding.h"
|
#include "util/coding.h"
|
||||||
#include "util/dynamic_bloom.h"
|
#include "util/dynamic_bloom.h"
|
||||||
#include "util/hash.h"
|
#include "util/hash.h"
|
||||||
@ -94,7 +95,8 @@ PlainTableReader::PlainTableReader(
|
|||||||
const Options& options, unique_ptr<RandomAccessFile>&& file,
|
const Options& options, unique_ptr<RandomAccessFile>&& file,
|
||||||
const EnvOptions& storage_options, const InternalKeyComparator& icomparator,
|
const EnvOptions& storage_options, const InternalKeyComparator& icomparator,
|
||||||
uint64_t file_size, int bloom_bits_per_key, double hash_table_ratio,
|
uint64_t file_size, int bloom_bits_per_key, double hash_table_ratio,
|
||||||
size_t index_sparseness, const TableProperties* table_properties)
|
size_t index_sparseness, const TableProperties* table_properties,
|
||||||
|
size_t huge_page_tlb_size)
|
||||||
: options_(options),
|
: options_(options),
|
||||||
soptions_(storage_options),
|
soptions_(storage_options),
|
||||||
file_(std::move(file)),
|
file_(std::move(file)),
|
||||||
@ -105,19 +107,23 @@ PlainTableReader::PlainTableReader(
|
|||||||
kIndexIntervalForSamePrefixKeys(index_sparseness),
|
kIndexIntervalForSamePrefixKeys(index_sparseness),
|
||||||
table_properties_(nullptr),
|
table_properties_(nullptr),
|
||||||
data_end_offset_(table_properties->data_size),
|
data_end_offset_(table_properties->data_size),
|
||||||
user_key_len_(table_properties->fixed_key_len) {
|
user_key_len_(table_properties->fixed_key_len),
|
||||||
|
huge_page_tlb_size_(huge_page_tlb_size) {
|
||||||
assert(kHashTableRatio >= 0.0);
|
assert(kHashTableRatio >= 0.0);
|
||||||
}
|
}
|
||||||
|
|
||||||
PlainTableReader::~PlainTableReader() {
|
PlainTableReader::~PlainTableReader() {
|
||||||
}
|
}
|
||||||
|
|
||||||
Status PlainTableReader::Open(
|
Status PlainTableReader::Open(const Options& options,
|
||||||
const Options& options, const EnvOptions& soptions,
|
const EnvOptions& soptions,
|
||||||
const InternalKeyComparator& internal_comparator,
|
const InternalKeyComparator& internal_comparator,
|
||||||
unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
|
unique_ptr<RandomAccessFile>&& file,
|
||||||
unique_ptr<TableReader>* table_reader, const int bloom_bits_per_key,
|
uint64_t file_size,
|
||||||
double hash_table_ratio, size_t index_sparseness) {
|
unique_ptr<TableReader>* table_reader,
|
||||||
|
const int bloom_bits_per_key,
|
||||||
|
double hash_table_ratio, size_t index_sparseness,
|
||||||
|
size_t huge_page_tlb_size) {
|
||||||
assert(options.allow_mmap_reads);
|
assert(options.allow_mmap_reads);
|
||||||
|
|
||||||
if (file_size > kMaxFileSize) {
|
if (file_size > kMaxFileSize) {
|
||||||
@ -133,7 +139,8 @@ Status PlainTableReader::Open(
|
|||||||
|
|
||||||
std::unique_ptr<PlainTableReader> new_reader(new PlainTableReader(
|
std::unique_ptr<PlainTableReader> new_reader(new PlainTableReader(
|
||||||
options, std::move(file), soptions, internal_comparator, file_size,
|
options, std::move(file), soptions, internal_comparator, file_size,
|
||||||
bloom_bits_per_key, hash_table_ratio, index_sparseness, props));
|
bloom_bits_per_key, hash_table_ratio, index_sparseness, props,
|
||||||
|
huge_page_tlb_size));
|
||||||
|
|
||||||
// -- Populate Index
|
// -- Populate Index
|
||||||
s = new_reader->PopulateIndex(props);
|
s = new_reader->PopulateIndex(props);
|
||||||
@ -264,12 +271,11 @@ Status PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list,
|
|||||||
}
|
}
|
||||||
|
|
||||||
void PlainTableReader::AllocateIndexAndBloom(int num_prefixes) {
|
void PlainTableReader::AllocateIndexAndBloom(int num_prefixes) {
|
||||||
index_.reset();
|
|
||||||
|
|
||||||
if (options_.prefix_extractor.get() != nullptr) {
|
if (options_.prefix_extractor.get() != nullptr) {
|
||||||
uint32_t bloom_total_bits = num_prefixes * kBloomBitsPerKey;
|
uint32_t bloom_total_bits = num_prefixes * kBloomBitsPerKey;
|
||||||
if (bloom_total_bits > 0) {
|
if (bloom_total_bits > 0) {
|
||||||
bloom_.reset(new DynamicBloom(bloom_total_bits, options_.bloom_locality));
|
bloom_.reset(new DynamicBloom(bloom_total_bits, options_.bloom_locality,
|
||||||
|
6, nullptr, huge_page_tlb_size_));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -281,7 +287,6 @@ void PlainTableReader::AllocateIndexAndBloom(int num_prefixes) {
|
|||||||
double hash_table_size_multipier = 1.0 / kHashTableRatio;
|
double hash_table_size_multipier = 1.0 / kHashTableRatio;
|
||||||
index_size_ = num_prefixes * hash_table_size_multipier + 1;
|
index_size_ = num_prefixes * hash_table_size_multipier + 1;
|
||||||
}
|
}
|
||||||
index_.reset(new uint32_t[index_size_]);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t PlainTableReader::BucketizeIndexesAndFillBloom(
|
size_t PlainTableReader::BucketizeIndexesAndFillBloom(
|
||||||
@ -325,7 +330,12 @@ void PlainTableReader::FillIndexes(
|
|||||||
const std::vector<uint32_t>& entries_per_bucket) {
|
const std::vector<uint32_t>& entries_per_bucket) {
|
||||||
Log(options_.info_log, "Reserving %zu bytes for plain table's sub_index",
|
Log(options_.info_log, "Reserving %zu bytes for plain table's sub_index",
|
||||||
kSubIndexSize);
|
kSubIndexSize);
|
||||||
sub_index_.reset(new char[kSubIndexSize]);
|
auto total_allocate_size = sizeof(uint32_t) * index_size_ + kSubIndexSize;
|
||||||
|
char* allocated =
|
||||||
|
arena_.AllocateAligned(total_allocate_size, huge_page_tlb_size_);
|
||||||
|
index_ = reinterpret_cast<uint32_t*>(allocated);
|
||||||
|
sub_index_ = allocated + sizeof(uint32_t) * index_size_;
|
||||||
|
|
||||||
size_t sub_index_offset = 0;
|
size_t sub_index_offset = 0;
|
||||||
for (int i = 0; i < index_size_; i++) {
|
for (int i = 0; i < index_size_; i++) {
|
||||||
uint32_t num_keys_for_bucket = entries_per_bucket[i];
|
uint32_t num_keys_for_bucket = entries_per_bucket[i];
|
||||||
@ -390,7 +400,8 @@ Status PlainTableReader::PopulateIndex(TableProperties* props) {
|
|||||||
if (IsTotalOrderMode()) {
|
if (IsTotalOrderMode()) {
|
||||||
uint32_t num_bloom_bits = table_properties_->num_entries * kBloomBitsPerKey;
|
uint32_t num_bloom_bits = table_properties_->num_entries * kBloomBitsPerKey;
|
||||||
if (num_bloom_bits > 0) {
|
if (num_bloom_bits > 0) {
|
||||||
bloom_.reset(new DynamicBloom(num_bloom_bits, options_.bloom_locality));
|
bloom_.reset(new DynamicBloom(num_bloom_bits, options_.bloom_locality, 6,
|
||||||
|
nullptr, huge_page_tlb_size_));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -17,6 +17,7 @@
|
|||||||
#include "rocksdb/table_properties.h"
|
#include "rocksdb/table_properties.h"
|
||||||
#include "table/table_reader.h"
|
#include "table/table_reader.h"
|
||||||
#include "table/plain_table_factory.h"
|
#include "table/plain_table_factory.h"
|
||||||
|
#include "util/arena.h"
|
||||||
|
|
||||||
namespace rocksdb {
|
namespace rocksdb {
|
||||||
|
|
||||||
@ -50,7 +51,7 @@ class PlainTableReader: public TableReader {
|
|||||||
unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
|
unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
|
||||||
unique_ptr<TableReader>* table,
|
unique_ptr<TableReader>* table,
|
||||||
const int bloom_bits_per_key, double hash_table_ratio,
|
const int bloom_bits_per_key, double hash_table_ratio,
|
||||||
size_t index_sparseness);
|
size_t index_sparseness, size_t huge_page_tlb_size);
|
||||||
|
|
||||||
bool PrefixMayMatch(const Slice& internal_prefix);
|
bool PrefixMayMatch(const Slice& internal_prefix);
|
||||||
|
|
||||||
@ -74,7 +75,8 @@ class PlainTableReader: public TableReader {
|
|||||||
const InternalKeyComparator& internal_comparator,
|
const InternalKeyComparator& internal_comparator,
|
||||||
uint64_t file_size, int bloom_num_bits,
|
uint64_t file_size, int bloom_num_bits,
|
||||||
double hash_table_ratio, size_t index_sparseness,
|
double hash_table_ratio, size_t index_sparseness,
|
||||||
const TableProperties* table_properties);
|
const TableProperties* table_properties,
|
||||||
|
size_t huge_page_tlb_size);
|
||||||
virtual ~PlainTableReader();
|
virtual ~PlainTableReader();
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
@ -136,9 +138,9 @@ class PlainTableReader: public TableReader {
|
|||||||
// For more details about the in-memory index, please refer to:
|
// For more details about the in-memory index, please refer to:
|
||||||
// https://github.com/facebook/rocksdb/wiki/PlainTable-Format
|
// https://github.com/facebook/rocksdb/wiki/PlainTable-Format
|
||||||
// #wiki-in-memory-index-format
|
// #wiki-in-memory-index-format
|
||||||
std::unique_ptr<uint32_t[]> index_;
|
uint32_t* index_;
|
||||||
int index_size_ = 0;
|
int index_size_ = 0;
|
||||||
std::unique_ptr<char[]> sub_index_;
|
char* sub_index_;
|
||||||
|
|
||||||
Options options_;
|
Options options_;
|
||||||
const EnvOptions& soptions_;
|
const EnvOptions& soptions_;
|
||||||
@ -159,6 +161,7 @@ class PlainTableReader: public TableReader {
|
|||||||
const size_t kIndexIntervalForSamePrefixKeys = 16;
|
const size_t kIndexIntervalForSamePrefixKeys = 16;
|
||||||
// Bloom filter is used to rule out non-existent key
|
// Bloom filter is used to rule out non-existent key
|
||||||
unique_ptr<DynamicBloom> bloom_;
|
unique_ptr<DynamicBloom> bloom_;
|
||||||
|
Arena arena_;
|
||||||
|
|
||||||
std::shared_ptr<const TableProperties> table_properties_;
|
std::shared_ptr<const TableProperties> table_properties_;
|
||||||
// data_start_offset_ and data_end_offset_ defines the range of the
|
// data_start_offset_ and data_end_offset_ defines the range of the
|
||||||
@ -166,6 +169,7 @@ class PlainTableReader: public TableReader {
|
|||||||
const uint32_t data_start_offset_ = 0;
|
const uint32_t data_start_offset_ = 0;
|
||||||
const uint32_t data_end_offset_;
|
const uint32_t data_end_offset_;
|
||||||
const size_t user_key_len_;
|
const size_t user_key_len_;
|
||||||
|
const size_t huge_page_tlb_size_;
|
||||||
|
|
||||||
static const size_t kNumInternalBytes = 8;
|
static const size_t kNumInternalBytes = 8;
|
||||||
static const uint32_t kSubIndexMask = 0x80000000;
|
static const uint32_t kSubIndexMask = 0x80000000;
|
||||||
|
@ -8,6 +8,7 @@
|
|||||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||||
|
|
||||||
#include "util/arena.h"
|
#include "util/arena.h"
|
||||||
|
#include <sys/mman.h>
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
|
||||||
namespace rocksdb {
|
namespace rocksdb {
|
||||||
@ -38,6 +39,13 @@ Arena::~Arena() {
|
|||||||
for (const auto& block : blocks_) {
|
for (const auto& block : blocks_) {
|
||||||
delete[] block;
|
delete[] block;
|
||||||
}
|
}
|
||||||
|
for (const auto& mmap_info : huge_blocks_) {
|
||||||
|
auto ret = munmap(mmap_info.addr_, mmap_info.length_);
|
||||||
|
if (ret != 0) {
|
||||||
|
// TODO(sdong): Better handling
|
||||||
|
perror("munmap");
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
char* Arena::AllocateFallback(size_t bytes, bool aligned) {
|
char* Arena::AllocateFallback(size_t bytes, bool aligned) {
|
||||||
@ -63,9 +71,29 @@ char* Arena::AllocateFallback(size_t bytes, bool aligned) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
char* Arena::AllocateAligned(size_t bytes) {
|
char* Arena::AllocateAligned(size_t bytes, size_t huge_page_tlb_size) {
|
||||||
assert((kAlignUnit & (kAlignUnit - 1)) ==
|
assert((kAlignUnit & (kAlignUnit - 1)) ==
|
||||||
0); // Pointer size should be a power of 2
|
0); // Pointer size should be a power of 2
|
||||||
|
|
||||||
|
#ifdef OS_LINUX
|
||||||
|
if (huge_page_tlb_size > 0 && bytes > 0) {
|
||||||
|
// Allocate from a huge page TLB table.
|
||||||
|
size_t reserved_size =
|
||||||
|
((bytes - 1U) / huge_page_tlb_size + 1U) * huge_page_tlb_size;
|
||||||
|
assert(reserved_size >= bytes);
|
||||||
|
void* addr = mmap(nullptr, reserved_size, (PROT_READ | PROT_WRITE),
|
||||||
|
(MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB), 0, 0);
|
||||||
|
if (addr == MAP_FAILED) {
|
||||||
|
perror("mmap");
|
||||||
|
// fall back to malloc
|
||||||
|
} else {
|
||||||
|
blocks_memory_ += reserved_size;
|
||||||
|
huge_blocks_.push_back(MmapInfo(addr, reserved_size));
|
||||||
|
return reinterpret_cast<char*>(addr);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
size_t current_mod =
|
size_t current_mod =
|
||||||
reinterpret_cast<uintptr_t>(aligned_alloc_ptr_) & (kAlignUnit - 1);
|
reinterpret_cast<uintptr_t>(aligned_alloc_ptr_) & (kAlignUnit - 1);
|
||||||
size_t slop = (current_mod == 0 ? 0 : kAlignUnit - current_mod);
|
size_t slop = (current_mod == 0 ? 0 : kAlignUnit - current_mod);
|
||||||
|
17
util/arena.h
17
util/arena.h
@ -34,7 +34,14 @@ class Arena {
|
|||||||
|
|
||||||
char* Allocate(size_t bytes);
|
char* Allocate(size_t bytes);
|
||||||
|
|
||||||
char* AllocateAligned(size_t bytes);
|
// huge_page_tlb_size: if >0, allocate bytes from huge page TLB and the size
|
||||||
|
// of the huge page TLB. Bytes will be rounded up to a multiple of that size and
|
||||||
|
// allocate huge pages through mmap anonymous option with huge page on.
|
||||||
|
// The extra space allocated will be wasted. To enable it, need to reserve
|
||||||
|
// huge pages for it to be allocated, like:
|
||||||
|
// sysctl -w vm.nr_hugepages=20
|
||||||
|
// See linux doc Documentation/vm/hugetlbpage.txt for details.
|
||||||
|
char* AllocateAligned(size_t bytes, size_t huge_page_tlb_size = 0);
|
||||||
|
|
||||||
// Returns an estimate of the total memory usage of data allocated
|
// Returns an estimate of the total memory usage of data allocated
|
||||||
// by the arena (exclude the space allocated but not yet used for future
|
// by the arena (exclude the space allocated but not yet used for future
|
||||||
@ -60,6 +67,14 @@ class Arena {
|
|||||||
// Array of new[] allocated memory blocks
|
// Array of new[] allocated memory blocks
|
||||||
typedef std::vector<char*> Blocks;
|
typedef std::vector<char*> Blocks;
|
||||||
Blocks blocks_;
|
Blocks blocks_;
|
||||||
|
|
||||||
|
struct MmapInfo {
|
||||||
|
void* addr_;
|
||||||
|
size_t length_;
|
||||||
|
|
||||||
|
MmapInfo(void* addr, size_t length) : addr_(addr), length_(length) {}
|
||||||
|
};
|
||||||
|
std::vector<MmapInfo> huge_blocks_;
|
||||||
size_t irregular_block_num = 0;
|
size_t irregular_block_num = 0;
|
||||||
|
|
||||||
// Stats for current active block.
|
// Stats for current active block.
|
||||||
|
@ -19,18 +19,19 @@ static uint32_t BloomHash(const Slice& key) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
DynamicBloom::DynamicBloom(uint32_t total_bits,
|
DynamicBloom::DynamicBloom(uint32_t total_bits, uint32_t cl_per_block,
|
||||||
uint32_t cl_per_block,
|
|
||||||
uint32_t num_probes,
|
uint32_t num_probes,
|
||||||
uint32_t (*hash_func)(const Slice& key))
|
uint32_t (*hash_func)(const Slice& key),
|
||||||
: kBlocked(cl_per_block > 0),
|
size_t huge_page_tlb_size)
|
||||||
kBitsPerBlock(std::min(cl_per_block, num_probes) * CACHE_LINE_SIZE * 8),
|
: kBlocked(cl_per_block > 0),
|
||||||
kTotalBits((kBlocked ? (total_bits + kBitsPerBlock - 1) / kBitsPerBlock
|
kBitsPerBlock(std::min(cl_per_block, num_probes) * CACHE_LINE_SIZE * 8),
|
||||||
* kBitsPerBlock :
|
kTotalBits((kBlocked ? (total_bits + kBitsPerBlock - 1) / kBitsPerBlock *
|
||||||
total_bits + 7) / 8 * 8),
|
kBitsPerBlock
|
||||||
kNumBlocks(kBlocked ? kTotalBits / kBitsPerBlock : 1),
|
: total_bits + 7) /
|
||||||
kNumProbes(num_probes),
|
8 * 8),
|
||||||
hash_func_(hash_func == nullptr ? &BloomHash : hash_func) {
|
kNumBlocks(kBlocked ? kTotalBits / kBitsPerBlock : 1),
|
||||||
|
kNumProbes(num_probes),
|
||||||
|
hash_func_(hash_func == nullptr ? &BloomHash : hash_func) {
|
||||||
assert(kBlocked ? kTotalBits > 0 : kTotalBits >= kBitsPerBlock);
|
assert(kBlocked ? kTotalBits > 0 : kTotalBits >= kBitsPerBlock);
|
||||||
assert(kNumProbes > 0);
|
assert(kNumProbes > 0);
|
||||||
|
|
||||||
@ -38,7 +39,9 @@ DynamicBloom::DynamicBloom(uint32_t total_bits,
|
|||||||
if (kBlocked) {
|
if (kBlocked) {
|
||||||
sz += CACHE_LINE_SIZE - 1;
|
sz += CACHE_LINE_SIZE - 1;
|
||||||
}
|
}
|
||||||
raw_ = new unsigned char[sz]();
|
raw_ = reinterpret_cast<unsigned char*>(
|
||||||
|
arena_.AllocateAligned(sz, huge_page_tlb_size));
|
||||||
|
memset(raw_, 0, sz);
|
||||||
if (kBlocked && (reinterpret_cast<uint64_t>(raw_) % CACHE_LINE_SIZE)) {
|
if (kBlocked && (reinterpret_cast<uint64_t>(raw_) % CACHE_LINE_SIZE)) {
|
||||||
data_ = raw_ + CACHE_LINE_SIZE -
|
data_ = raw_ + CACHE_LINE_SIZE -
|
||||||
reinterpret_cast<uint64_t>(raw_) % CACHE_LINE_SIZE;
|
reinterpret_cast<uint64_t>(raw_) % CACHE_LINE_SIZE;
|
||||||
|
@ -8,6 +8,8 @@
|
|||||||
#include <atomic>
|
#include <atomic>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
|
||||||
|
#include <util/arena.h>
|
||||||
|
|
||||||
namespace rocksdb {
|
namespace rocksdb {
|
||||||
|
|
||||||
class Slice;
|
class Slice;
|
||||||
@ -19,13 +21,17 @@ class DynamicBloom {
|
|||||||
// cl_per_block: block size in cache lines. When this is non-zero, a
|
// cl_per_block: block size in cache lines. When this is non-zero, a
|
||||||
// query/set is done within a block to improve cache locality.
|
// query/set is done within a block to improve cache locality.
|
||||||
// hash_func: customized hash function
|
// hash_func: customized hash function
|
||||||
|
// huge_page_tlb_size: if >0, try to allocate bloom bytes from huge page TLB
|
||||||
|
// with this page size. Need to reserve huge pages for
|
||||||
|
// it to be allocated, like:
|
||||||
|
// sysctl -w vm.nr_hugepages=20
|
||||||
|
// See linux doc Documentation/vm/hugetlbpage.txt
|
||||||
explicit DynamicBloom(uint32_t total_bits, uint32_t cl_per_block = 0,
|
explicit DynamicBloom(uint32_t total_bits, uint32_t cl_per_block = 0,
|
||||||
uint32_t num_probes = 6,
|
uint32_t num_probes = 6,
|
||||||
uint32_t (*hash_func)(const Slice& key) = nullptr);
|
uint32_t (*hash_func)(const Slice& key) = nullptr,
|
||||||
|
size_t huge_page_tlb_size = 0);
|
||||||
|
|
||||||
~DynamicBloom() {
|
~DynamicBloom() {}
|
||||||
delete[] raw_;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Assuming single threaded access to this function.
|
// Assuming single threaded access to this function.
|
||||||
void Add(const Slice& key);
|
void Add(const Slice& key);
|
||||||
@ -49,6 +55,8 @@ class DynamicBloom {
|
|||||||
uint32_t (*hash_func_)(const Slice& key);
|
uint32_t (*hash_func_)(const Slice& key);
|
||||||
unsigned char* data_;
|
unsigned char* data_;
|
||||||
unsigned char* raw_;
|
unsigned char* raw_;
|
||||||
|
|
||||||
|
Arena arena_;
|
||||||
};
|
};
|
||||||
|
|
||||||
inline void DynamicBloom::Add(const Slice& key) { AddHash(hash_func_(key)); }
|
inline void DynamicBloom::Add(const Slice& key) { AddHash(hash_func_(key)); }
|
||||||
|
@ -52,7 +52,8 @@ struct Node {
|
|||||||
class HashLinkListRep : public MemTableRep {
|
class HashLinkListRep : public MemTableRep {
|
||||||
public:
|
public:
|
||||||
HashLinkListRep(const MemTableRep::KeyComparator& compare, Arena* arena,
|
HashLinkListRep(const MemTableRep::KeyComparator& compare, Arena* arena,
|
||||||
const SliceTransform* transform, size_t bucket_size);
|
const SliceTransform* transform, size_t bucket_size,
|
||||||
|
size_t huge_page_tlb_size);
|
||||||
|
|
||||||
virtual KeyHandle Allocate(const size_t len, char** buf) override;
|
virtual KeyHandle Allocate(const size_t len, char** buf) override;
|
||||||
|
|
||||||
@ -308,13 +309,13 @@ class HashLinkListRep : public MemTableRep {
|
|||||||
|
|
||||||
HashLinkListRep::HashLinkListRep(const MemTableRep::KeyComparator& compare,
|
HashLinkListRep::HashLinkListRep(const MemTableRep::KeyComparator& compare,
|
||||||
Arena* arena, const SliceTransform* transform,
|
Arena* arena, const SliceTransform* transform,
|
||||||
size_t bucket_size)
|
size_t bucket_size, size_t huge_page_tlb_size)
|
||||||
: MemTableRep(arena),
|
: MemTableRep(arena),
|
||||||
bucket_size_(bucket_size),
|
bucket_size_(bucket_size),
|
||||||
transform_(transform),
|
transform_(transform),
|
||||||
compare_(compare) {
|
compare_(compare) {
|
||||||
char* mem = arena_->AllocateAligned(
|
char* mem = arena_->AllocateAligned(sizeof(port::AtomicPointer) * bucket_size,
|
||||||
sizeof(port::AtomicPointer) * bucket_size);
|
huge_page_tlb_size);
|
||||||
|
|
||||||
buckets_ = new (mem) port::AtomicPointer[bucket_size];
|
buckets_ = new (mem) port::AtomicPointer[bucket_size];
|
||||||
|
|
||||||
@ -476,11 +477,13 @@ Node* HashLinkListRep::FindGreaterOrEqualInBucket(Node* head,
|
|||||||
MemTableRep* HashLinkListRepFactory::CreateMemTableRep(
|
MemTableRep* HashLinkListRepFactory::CreateMemTableRep(
|
||||||
const MemTableRep::KeyComparator& compare, Arena* arena,
|
const MemTableRep::KeyComparator& compare, Arena* arena,
|
||||||
const SliceTransform* transform) {
|
const SliceTransform* transform) {
|
||||||
return new HashLinkListRep(compare, arena, transform, bucket_count_);
|
return new HashLinkListRep(compare, arena, transform, bucket_count_,
|
||||||
|
huge_page_tlb_size_);
|
||||||
}
|
}
|
||||||
|
|
||||||
MemTableRepFactory* NewHashLinkListRepFactory(size_t bucket_count) {
|
MemTableRepFactory* NewHashLinkListRepFactory(size_t bucket_count,
|
||||||
return new HashLinkListRepFactory(bucket_count);
|
size_t huge_page_tlb_size) {
|
||||||
|
return new HashLinkListRepFactory(bucket_count, huge_page_tlb_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace rocksdb
|
} // namespace rocksdb
|
||||||
|
@ -14,8 +14,9 @@ namespace rocksdb {
|
|||||||
|
|
||||||
class HashLinkListRepFactory : public MemTableRepFactory {
|
class HashLinkListRepFactory : public MemTableRepFactory {
|
||||||
public:
|
public:
|
||||||
explicit HashLinkListRepFactory(size_t bucket_count)
|
explicit HashLinkListRepFactory(size_t bucket_count,
|
||||||
: bucket_count_(bucket_count) { }
|
size_t huge_page_tlb_size)
|
||||||
|
: bucket_count_(bucket_count), huge_page_tlb_size_(huge_page_tlb_size) {}
|
||||||
|
|
||||||
virtual ~HashLinkListRepFactory() {}
|
virtual ~HashLinkListRepFactory() {}
|
||||||
|
|
||||||
@ -29,6 +30,7 @@ class HashLinkListRepFactory : public MemTableRepFactory {
|
|||||||
|
|
||||||
private:
|
private:
|
||||||
const size_t bucket_count_;
|
const size_t bucket_count_;
|
||||||
|
const size_t huge_page_tlb_size_;
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -317,6 +317,10 @@ Options::Dump(Logger* log) const
|
|||||||
memtable_prefix_bloom_bits);
|
memtable_prefix_bloom_bits);
|
||||||
Log(log, " Options.memtable_prefix_bloom_probes: %d",
|
Log(log, " Options.memtable_prefix_bloom_probes: %d",
|
||||||
memtable_prefix_bloom_probes);
|
memtable_prefix_bloom_probes);
|
||||||
|
Log(log, " Options.memtable_prefix_bloom_huge_page_tlb_size: %zu",
|
||||||
|
memtable_prefix_bloom_huge_page_tlb_size);
|
||||||
|
Log(log, " Options.bloom_locality: %d",
|
||||||
|
bloom_locality);
|
||||||
Log(log, " Options.max_successive_merges: %zd",
|
Log(log, " Options.max_successive_merges: %zd",
|
||||||
max_successive_merges);
|
max_successive_merges);
|
||||||
} // Options::Dump
|
} // Options::Dump
|
||||||
|
Loading…
x
Reference in New Issue
Block a user