Use malloc_usable_size() for accounting block cache size

Summary:
Currently, when we insert something into block cache, we say that the block cache capacity decreased by the size of the block. However, size of the block might be less than the actual memory used by this object. For example, 4.5KB block will actually use 8KB of memory. So even if we configure block cache to 10GB, our actually memory usage of block cache will be 20GB!

This problem showed up a lot in testing and just recently also showed up in MongoRocks production where we were using 30GB more memory than expected.

This diff will fix the problem. Instead of counting the block size, we will count memory used by the block. That way, a block cache configured to be 10GB will actually use only 10GB of memory.

I'm using non-portable function and I couldn't find info on portability on Google. However, it seems to work on Linux, which will cover majority of our use-cases.

Test Plan:
1. fill up mongo instance with 80GB of data
2. restart mongo with block cache size configured to 10GB
3. do a table scan in mongo
4. memory usage before the diff: 12GB. memory usage after the diff: 10.5GB

Reviewers: sdong, MarkCallaghan, rven, yhchiang

Reviewed By: yhchiang

Subscribers: dhruba, leveldb

Differential Revision: https://reviews.facebook.net/D40635
This commit is contained in:
Igor Canadi 2015-06-26 11:48:09 -07:00
parent 4cbc4e6f88
commit 0a019d74a0
8 changed files with 47 additions and 14 deletions

View File

@ -21,6 +21,7 @@
* CompactRange() will now skip bottommost level compaction for level based compaction if there is no compaction filter, bottommost_level_compaction is introduced in CompactRangeOptions to control when it's possbile to skip bottommost level compaction. This mean that if you want the compaction to produce a single file you need to set bottommost_level_compaction to BottommostLevelCompaction::kForce.
* Add Cache.GetPinnedUsage() to get the size of memory occupied by entries that are in use by the system.
* DB:Open() will fail if the compression specified in Options is not linked with the binary. If you see this failure, recompile RocksDB with compression libraries present on your system. Also, previously our default compression was snappy. This behavior is now changed. Now, the default compression is snappy only if it's available on the system. If it isn't we change the default to kNoCompression.
* We changed how we account for memory used in block cache. Previously, we only counted the sum of block sizes currently present in block cache. Now, we count the actual memory usage of the blocks. For example, a block of size 4.5KB will use 8KB memory with jemalloc. This might decrease your memory usage and possibly decrease performance. Increase block cache size if you see this happening after an upgrade.
## 3.11.0 (5/19/2015)
### New Features

View File

@ -294,6 +294,18 @@ EOF
JAVA_LDFLAGS="$JAVA_LDFLAGS -ltcmalloc"
fi
fi
# Test whether malloc_usable_size is available
$CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
#include <malloc.h>
int main() {
size_t res = malloc_usable_size(0);
return 0;
}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_MALLOC_USABLE_SIZE"
fi
fi
# TODO(tec): Fix -Wshorten-64-to-32 errors on FreeBSD and enable the warning.

View File

@ -111,7 +111,7 @@ else
fi
CFLAGS+=" $DEPS_INCLUDE"
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_FALLOCATE_PRESENT"
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE"
CXXFLAGS+=" $CFLAGS"
EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS $NUMA_LIB"

View File

@ -87,7 +87,7 @@ else
fi
CFLAGS+=" $DEPS_INCLUDE"
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_FALLOCATE_PRESENT"
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE"
CFLAGS+=" -DSNAPPY -DGFLAGS=google -DZLIB -DBZIP2 -DLZ4 -DNUMA"
CXXFLAGS+=" $CFLAGS"

View File

@ -359,7 +359,7 @@ void Block::SetBlockPrefixIndex(BlockPrefixIndex* prefix_index) {
}
size_t Block::ApproximateMemoryUsage() const {
size_t usage = size();
size_t usage = usable_size();
if (hash_index_) {
usage += hash_index_->ApproximateMemoryUsage();
}

View File

@ -10,6 +10,9 @@
#pragma once
#include <stddef.h>
#include <stdint.h>
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
#include <malloc.h>
#endif
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
@ -37,6 +40,14 @@ class Block {
size_t size() const { return size_; }
const char* data() const { return data_; }
bool cachable() const { return contents_.cachable; }
size_t usable_size() const {
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
if (contents_.allocation.get() != nullptr) {
return malloc_usable_size(contents_.allocation.get());
}
#endif // ROCKSDB_MALLOC_USABLE_SIZE
return size_;
}
uint32_t NumRestarts() const;
CompressionType compression_type() const {
return contents_.compression_type;

View File

@ -703,8 +703,8 @@ Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents,
(end - r->compressed_cache_key_prefix));
// Insert into compressed block cache.
cache_handle = block_cache_compressed->Insert(key, block, block->size(),
&DeleteCachedBlock);
cache_handle = block_cache_compressed->Insert(
key, block, block->usable_size(), &DeleteCachedBlock);
block_cache_compressed->Release(cache_handle);
// Invalidate OS cache.

View File

@ -147,6 +147,8 @@ class BlockBasedTable::IndexReader {
// The size of the index.
virtual size_t size() const = 0;
// Memory usage of the index block
virtual size_t usable_size() const = 0;
// Report an approximation of how much memory has been used other than memory
// that was allocated in block cache.
@ -187,6 +189,9 @@ class BinarySearchIndexReader : public IndexReader {
}
virtual size_t size() const override { return index_block_->size(); }
virtual size_t usable_size() const override {
return index_block_->usable_size();
}
virtual size_t ApproximateMemoryUsage() const override {
assert(index_block_);
@ -295,6 +300,9 @@ class HashIndexReader : public IndexReader {
}
virtual size_t size() const override { return index_block_->size(); }
virtual size_t usable_size() const override {
return index_block_->usable_size();
}
virtual size_t ApproximateMemoryUsage() const override {
assert(index_block_);
@ -702,9 +710,9 @@ Status BlockBasedTable::GetDataBlockFromCache(
assert(block->value->compression_type() == kNoCompression);
if (block_cache != nullptr && block->value->cachable() &&
read_options.fill_cache) {
block->cache_handle =
block_cache->Insert(block_cache_key, block->value,
block->value->size(), &DeleteCachedEntry<Block>);
block->cache_handle = block_cache->Insert(block_cache_key, block->value,
block->value->usable_size(),
&DeleteCachedEntry<Block>);
assert(reinterpret_cast<Block*>(
block_cache->Value(block->cache_handle)) == block->value);
}
@ -747,7 +755,7 @@ Status BlockBasedTable::PutDataBlockToCache(
if (block_cache_compressed != nullptr && raw_block != nullptr &&
raw_block->cachable()) {
auto cache_handle = block_cache_compressed->Insert(
compressed_block_cache_key, raw_block, raw_block->size(),
compressed_block_cache_key, raw_block, raw_block->usable_size(),
&DeleteCachedEntry<Block>);
block_cache_compressed->Release(cache_handle);
RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS);
@ -759,9 +767,9 @@ Status BlockBasedTable::PutDataBlockToCache(
// insert into uncompressed block cache
assert((block->value->compression_type() == kNoCompression));
if (block_cache != nullptr && block->value->cachable()) {
block->cache_handle =
block_cache->Insert(block_cache_key, block->value, block->value->size(),
&DeleteCachedEntry<Block>);
block->cache_handle = block_cache->Insert(block_cache_key, block->value,
block->value->usable_size(),
&DeleteCachedEntry<Block>);
RecordTick(statistics, BLOCK_CACHE_ADD);
assert(reinterpret_cast<Block*>(block_cache->Value(block->cache_handle)) ==
block->value);
@ -913,8 +921,9 @@ Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options,
}
}
cache_handle = block_cache->Insert(key, index_reader, index_reader->size(),
&DeleteCachedEntry<IndexReader>);
cache_handle =
block_cache->Insert(key, index_reader, index_reader->usable_size(),
&DeleteCachedEntry<IndexReader>);
RecordTick(statistics, BLOCK_CACHE_ADD);
}