Adding NUMA support to db_bench tests

Summary:
Changes:
- Adding numa_aware flag to db_bench.cc
- Using numa.h library to bind memory and cpu of threads to a fixed NUMA node
Result: There seems to be no significant change in the micros/op time with numa_aware enabled. I also tried this with other implementations, including a combination of pthread_setaffinity_np, sched_setaffinity and set_mempolicy methods. It'd be great if someone could point out where I'm going wrong and if we can achieve a better micors/op.

Test Plan:
Ran db_bench tests using following command:
./db_bench --db=/mnt/tmp --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=134217728 --max_bytes_for_level_base=1073741824 --disable_wal=0 --wal_dir=/mnt/tmp --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --duration=300 --benchmarks=readwhilewriting --use_existing_db=1 --num=157286400 --threads=24 --writes_per_second=10240 --numa_aware=[False/True]

The tests were run in private devserver with 24 cores and the db was prepopulated using filluniquerandom test. The tests resulted in 0.145 us/op with numa_aware=False and 0.161 us/op with numa_aware=True.

Reviewers: sdong, yhchiang, ljin, igor

Reviewed By: ljin, igor

Subscribers: igor, leveldb

Differential Revision: https://reviews.facebook.net/D19353
This commit is contained in:
Radheshyam Balasundaram 2014-07-07 10:53:31 -07:00
parent 0bc5fa9f40
commit f0660d5253
3 changed files with 63 additions and 3 deletions

View File

@ -21,6 +21,7 @@
# -DLEVELDB_PLATFORM_NOATOMIC if it is not
# -DSNAPPY if the Snappy library is present
# -DLZ4 if the LZ4 library is present
# -DNUMA if the NUMA library is present
#
# Using gflags in rocksdb:
# Our project depends on gflags, which requires users to take some extra steps
@ -272,6 +273,17 @@ EOF
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -llz4"
fi
# Test whether numa is available
$CXX $CFLAGS -x c++ - -o /dev/null -lnuma 2>/dev/null <<EOF
#include <numa.h>
#inlcude <numaif.h>
int main() {}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DNUMA"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lnuma"
fi
# Test whether tcmalloc is available
$CXX $CFLAGS -x c++ - -o /dev/null -ltcmalloc 2>/dev/null <<EOF
int main() {}

View File

@ -55,22 +55,27 @@ GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/c3f970a/lib/libgflags.a"
JEMALLOC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/jemalloc/jemalloc-3.4.1/4d53c6f/include/"
JEMALLOC_LIB=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/jemalloc-3.4.1/4d53c6f/lib/libjemalloc.a"
# location of numa
NUMA_REV=829d10dac0230f99cd7e1778869d2adf3da24b65
NUMA_INCLUDE=" -I /mnt/gvfs/third-party2/numa/$NUMA_REV/2.0.8/gcc-4.8.1-glibc-2.17/c3f970a/include/"
NUMA_LIB=" /mnt/gvfs/third-party2/numa/$NUMA_REV/2.0.8/gcc-4.8.1-glibc-2.17/c3f970a/lib/libnuma.a"
# use Intel SSE support for checksum calculations
export USE_SSE=" -msse -msse4.2 "
CC="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.8.1/cc6c9dc/bin/gcc"
CXX="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.8.1/cc6c9dc/bin/g++ $JINCLUDE $SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $GFLAGS_INCLUDE"
CXX="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.8.1/cc6c9dc/bin/g++ $JINCLUDE $SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $GFLAGS_INCLUDE $NUMA_INCLUDE"
AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar
RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib
CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/gold -m64 -mtune=generic"
CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE"
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_ATOMIC_PRESENT -DROCKSDB_FALLOCATE_PRESENT"
CFLAGS+=" -DSNAPPY -DGFLAGS=google -DZLIB -DBZIP2 -DLZ4"
CFLAGS+=" -DSNAPPY -DGFLAGS=google -DZLIB -DBZIP2 -DLZ4 -DNUMA"
EXEC_LDFLAGS="-Wl,--dynamic-linker,/usr/local/fbcode/gcc-4.8.1-glibc-2.17/lib/ld.so"
EXEC_LDFLAGS+=" -Wl,--no-whole-archive $TOOLCHAIN_LIB_BASE/libunwind/libunwind-1.0.1/675d945/lib/libunwind.a"
EXEC_LDFLAGS+=" $HDFSLIB $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS"
EXEC_LDFLAGS+=" $HDFSLIB $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS $NUMA_LIB"
PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS "

View File

@ -17,6 +17,11 @@ int main() {
}
#else
#ifdef NUMA
#include <numa.h>
#include <numaif.h>
#endif
#include <inttypes.h>
#include <cstddef>
#include <sys/types.h>
@ -173,6 +178,14 @@ DEFINE_double(compression_ratio, 0.5, "Arrange to generate values that shrink"
DEFINE_bool(histogram, false, "Print histogram of operation timings");
DEFINE_bool(enable_numa, false,
"Make operations aware of NUMA architecture and bind memory "
"and cpus corresponding to nodes together. In NUMA, memory "
"in same node as CPUs are closer when compared to memory in "
"other nodes. Reads can be faster when the process is bound to "
"CPU and memory of same node. Use \"$numactl --hardware\" command "
"to see NUMA memory architecture.");
DEFINE_int64(write_buffer_size, rocksdb::Options().write_buffer_size,
"Number of bytes to buffer in memtable before compacting");
@ -863,6 +876,18 @@ class Benchmark {
* num_)
/ 1048576.0));
fprintf(stdout, "Write rate limit: %d\n", FLAGS_writes_per_second);
if (FLAGS_enable_numa) {
fprintf(stderr, "Running in NUMA enabled mode.\n");
#ifndef NUMA
fprintf(stderr, "NUMA is not defined in the system.\n");
exit(1);
#else
if (numa_available() == -1) {
fprintf(stderr, "NUMA is not supported by the system.\n");
exit(1);
}
#endif
}
switch (FLAGS_compression_type_e) {
case rocksdb::kNoCompression:
fprintf(stdout, "Compression: none\n");
@ -1348,7 +1373,25 @@ class Benchmark {
shared.start = false;
ThreadArg* arg = new ThreadArg[n];
for (int i = 0; i < n; i++) {
#ifdef NUMA
if (FLAGS_enable_numa) {
// Performs a local allocation of memory to threads in numa node.
int n_nodes = numa_num_task_nodes(); // Number of nodes in NUMA.
numa_exit_on_error = 1;
int numa_node = i % n_nodes;
bitmask* nodes = numa_allocate_nodemask();
numa_bitmask_clearall(nodes);
numa_bitmask_setbit(nodes, numa_node);
// numa_bind() call binds the process to the node and these
// properties are passed on to the thread that is created in
// StartThread method called later in the loop.
numa_bind(nodes);
numa_set_strict(1);
numa_free_nodemask(nodes);
}
#endif
arg[i].bm = this;
arg[i].method = method;
arg[i].shared = &shared;