Adding NUMA support to db_bench tests
Summary: Changes: - Adding enable_numa flag to db_bench.cc - Using numa.h library to bind memory and cpu of threads to a fixed NUMA node Result: There seems to be no significant change in the micros/op time with enable_numa enabled. I also tried this with other implementations, including a combination of pthread_setaffinity_np, sched_setaffinity and set_mempolicy methods. It'd be great if someone could point out where I'm going wrong and if we can achieve a better micros/op. Test Plan: Ran db_bench tests using the following command: ./db_bench --db=/mnt/tmp --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=134217728 --max_bytes_for_level_base=1073741824 --disable_wal=0 --wal_dir=/mnt/tmp --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --duration=300 --benchmarks=readwhilewriting --use_existing_db=1 --num=157286400 --threads=24 --writes_per_second=10240 --enable_numa=[False/True] The tests were run on a private devserver with 24 cores and the db was prepopulated using the filluniquerandom test. The tests resulted in 0.145 us/op with enable_numa=False and 0.161 us/op with enable_numa=True.
Reviewers: sdong, yhchiang, ljin, igor Reviewed By: ljin, igor Subscribers: igor, leveldb Differential Revision: https://reviews.facebook.net/D19353
This commit is contained in:
parent
0bc5fa9f40
commit
f0660d5253
@ -21,6 +21,7 @@
|
||||
# -DLEVELDB_PLATFORM_NOATOMIC if it is not
|
||||
# -DSNAPPY if the Snappy library is present
|
||||
# -DLZ4 if the LZ4 library is present
|
||||
# -DNUMA if the NUMA library is present
|
||||
#
|
||||
# Using gflags in rocksdb:
|
||||
# Our project depends on gflags, which requires users to take some extra steps
|
||||
@ -272,6 +273,17 @@ EOF
|
||||
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -llz4"
|
||||
fi
|
||||
|
||||
# Test whether numa is available.
#
# Probe by compiling and linking a trivial program that includes both libnuma
# headers (<numa.h> and <numaif.h>) against -lnuma. The output binary is
# discarded and compiler diagnostics are silenced; only the exit status is
# used. Every line of the heredoc must be valid C++, otherwise the probe
# fails unconditionally and NUMA support is never enabled.
$CXX $CFLAGS -x c++ - -o /dev/null -lnuma 2>/dev/null <<EOF
  #include <numa.h>
  #include <numaif.h>
  int main() {}
EOF
# On success, define NUMA for the sources that guard their libnuma usage with
# "#ifdef NUMA", and add the library to the link line.
if [ "$?" = 0 ]; then
    COMMON_FLAGS="$COMMON_FLAGS -DNUMA"
    PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lnuma"
fi
|
||||
|
||||
# Test whether tcmalloc is available
|
||||
$CXX $CFLAGS -x c++ - -o /dev/null -ltcmalloc 2>/dev/null <<EOF
|
||||
int main() {}
|
||||
|
@ -55,22 +55,27 @@ GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/c3f970a/lib/libgflags.a"
|
||||
JEMALLOC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/jemalloc/jemalloc-3.4.1/4d53c6f/include/"
|
||||
JEMALLOC_LIB=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/jemalloc-3.4.1/4d53c6f/lib/libjemalloc.a"
|
||||
|
||||
# location of numa
|
||||
NUMA_REV=829d10dac0230f99cd7e1778869d2adf3da24b65
|
||||
NUMA_INCLUDE=" -I /mnt/gvfs/third-party2/numa/$NUMA_REV/2.0.8/gcc-4.8.1-glibc-2.17/c3f970a/include/"
|
||||
NUMA_LIB=" /mnt/gvfs/third-party2/numa/$NUMA_REV/2.0.8/gcc-4.8.1-glibc-2.17/c3f970a/lib/libnuma.a"
|
||||
|
||||
# use Intel SSE support for checksum calculations
|
||||
export USE_SSE=" -msse -msse4.2 "
|
||||
|
||||
CC="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.8.1/cc6c9dc/bin/gcc"
|
||||
CXX="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.8.1/cc6c9dc/bin/g++ $JINCLUDE $SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $GFLAGS_INCLUDE"
|
||||
CXX="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.8.1/cc6c9dc/bin/g++ $JINCLUDE $SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $GFLAGS_INCLUDE $NUMA_INCLUDE"
|
||||
AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar
|
||||
RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib
|
||||
|
||||
CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/gold -m64 -mtune=generic"
|
||||
CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE"
|
||||
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_ATOMIC_PRESENT -DROCKSDB_FALLOCATE_PRESENT"
|
||||
CFLAGS+=" -DSNAPPY -DGFLAGS=google -DZLIB -DBZIP2 -DLZ4"
|
||||
CFLAGS+=" -DSNAPPY -DGFLAGS=google -DZLIB -DBZIP2 -DLZ4 -DNUMA"
|
||||
|
||||
EXEC_LDFLAGS="-Wl,--dynamic-linker,/usr/local/fbcode/gcc-4.8.1-glibc-2.17/lib/ld.so"
|
||||
EXEC_LDFLAGS+=" -Wl,--no-whole-archive $TOOLCHAIN_LIB_BASE/libunwind/libunwind-1.0.1/675d945/lib/libunwind.a"
|
||||
EXEC_LDFLAGS+=" $HDFSLIB $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS"
|
||||
EXEC_LDFLAGS+=" $HDFSLIB $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS $NUMA_LIB"
|
||||
|
||||
PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS "
|
||||
|
||||
|
@ -17,6 +17,11 @@ int main() {
|
||||
}
|
||||
#else
|
||||
|
||||
#ifdef NUMA
|
||||
#include <numa.h>
|
||||
#include <numaif.h>
|
||||
#endif
|
||||
|
||||
#include <inttypes.h>
|
||||
#include <cstddef>
|
||||
#include <sys/types.h>
|
||||
@ -173,6 +178,14 @@ DEFINE_double(compression_ratio, 0.5, "Arrange to generate values that shrink"
|
||||
|
||||
DEFINE_bool(histogram, false, "Print histogram of operation timings");
|
||||
|
||||
// --enable_numa (default: false): opt-in switch for NUMA-aware benchmarking.
// Where this flag is read in this file:
//   * the benchmark header printout exits with status 1 when the flag is set
//     but the binary was built without -DNUMA, or when numa_available()
//     returns -1;
//   * the per-thread setup loop calls numa_bind() on node (i % n_nodes)
//     before filling in thread i's arguments (only compiled under NUMA).
DEFINE_bool(enable_numa, false,
|
||||
"Make operations aware of NUMA architecture and bind memory "
|
||||
"and cpus corresponding to nodes together. In NUMA, memory "
|
||||
"in same node as CPUs are closer when compared to memory in "
|
||||
"other nodes. Reads can be faster when the process is bound to "
|
||||
"CPU and memory of same node. Use \"$numactl --hardware\" command "
|
||||
"to see NUMA memory architecture.");
|
||||
|
||||
DEFINE_int64(write_buffer_size, rocksdb::Options().write_buffer_size,
|
||||
"Number of bytes to buffer in memtable before compacting");
|
||||
|
||||
@ -863,6 +876,18 @@ class Benchmark {
|
||||
* num_)
|
||||
/ 1048576.0));
|
||||
fprintf(stdout, "Write rate limit: %d\n", FLAGS_writes_per_second);
|
||||
if (FLAGS_enable_numa) {
|
||||
fprintf(stderr, "Running in NUMA enabled mode.\n");
|
||||
#ifndef NUMA
|
||||
fprintf(stderr, "NUMA is not defined in the system.\n");
|
||||
exit(1);
|
||||
#else
|
||||
if (numa_available() == -1) {
|
||||
fprintf(stderr, "NUMA is not supported by the system.\n");
|
||||
exit(1);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
switch (FLAGS_compression_type_e) {
|
||||
case rocksdb::kNoCompression:
|
||||
fprintf(stdout, "Compression: none\n");
|
||||
@ -1348,7 +1373,25 @@ class Benchmark {
|
||||
shared.start = false;
|
||||
|
||||
ThreadArg* arg = new ThreadArg[n];
|
||||
|
||||
for (int i = 0; i < n; i++) {
|
||||
#ifdef NUMA
|
||||
if (FLAGS_enable_numa) {
|
||||
// Performs a local allocation of memory to threads in numa node.
|
||||
int n_nodes = numa_num_task_nodes(); // Number of nodes in NUMA.
|
||||
numa_exit_on_error = 1;
|
||||
int numa_node = i % n_nodes;
|
||||
bitmask* nodes = numa_allocate_nodemask();
|
||||
numa_bitmask_clearall(nodes);
|
||||
numa_bitmask_setbit(nodes, numa_node);
|
||||
// numa_bind() call binds the process to the node and these
|
||||
// properties are passed on to the thread that is created in
|
||||
// StartThread method called later in the loop.
|
||||
numa_bind(nodes);
|
||||
numa_set_strict(1);
|
||||
numa_free_nodemask(nodes);
|
||||
}
|
||||
#endif
|
||||
arg[i].bm = this;
|
||||
arg[i].method = method;
|
||||
arg[i].shared = &shared;
|
||||
|
Loading…
Reference in New Issue
Block a user