Various fixes: use td integer typedefs and td:: helpers consistently, improve SSE2 detection, mask SIMD control-byte matches to the 14-byte chunk, add a license header, const operator[] and empty() to fixed_vector, trim includes, and reduce test iteration counts.

levlam 2022-02-18 23:04:25 +03:00
parent b5cf85d6e2
commit ae3854d97c
6 changed files with 120 additions and 100 deletions

View File

@@ -9,6 +9,9 @@
#include "td/utils/FlatHashMapChunks.h"
#include "td/utils/FlatHashMapLinear.h"
//#include <unordered_map>
//#include <unordered_set>
namespace td {
template <class KeyT, class ValueT, class HashT = std::hash<KeyT>, class EqT = std::equal_to<KeyT>>
//using FlatHashMap = FlatHashMapImpl<KeyT, ValueT, HashT, EqT>;
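For context: FlatHashMap is an alias that selects one of the hash table implementations included above, and whichever one is active it keeps an unordered_map-style surface, which is what the tests and benchmarks later in this commit rely on. A minimal usage sketch (illustrative only; the member functions shown are the ones exercised by the test code below):
// Illustrative sketch, not part of this commit.
#include "td/utils/common.h"
#include "td/utils/FlatHashMap.h"
inline td::string lookup_example(td::int32 key) {
  td::FlatHashMap<td::int32, td::string> map;
  map.emplace(key, "value");     // insert
  if (map.count(key) > 0) {      // membership check
    map[key + 1] = "next";       // operator[] default-constructs a missing entry
  }
  auto it = map.find(key + 1);   // lookup returns an iterator
  td::string result = it->second;
  map.erase(it);                 // erase by iterator
  return result;
}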

View File

@@ -10,22 +10,22 @@
#include "td/utils/bits.h"
#include "td/utils/common.h"
#include "td/utils/FlatHashMapLinear.h"
#include "td/utils/logging.h"
#include <cstddef>
#include <functional>
#include <initializer_list>
#include <iterator>
#include <new>
#include <limits>
#include <utility>
#if (defined(_MSC_VER) && (defined(_M_X64) || (defined(_M_IX86) && _M_IX86_FP >= 2)))
#if defined(__SSE2__) || (TD_MSVC && (defined(_M_X64) || (defined(_M_IX86) && _M_IX86_FP >= 2)))
#define TD_SSE2 1
#endif
#ifdef __aarch64__
#include <arm_neon.h>
#endif
#if TD_SSE2
#include <emmintrin.h>
#endif
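The reworked condition above enables the SSE2 path for GCC and Clang builds too (they define __SSE2__ when SSE2 is available), not only for MSVC x86/x64 builds. The guarded helpers defined below are then selected per platform; a sketch of that compile-time dispatch (the alias name is illustrative, not taken from this hunk):
// Illustrative sketch, not part of this commit: choosing one of the helpers
// defined below, mirroring the #if structure of this header.
#ifdef __aarch64__
using MaskHelperSketch = MaskNeonFolly;  // NEON path on 64-bit ARM
#elif TD_SSE2
using MaskHelperSketch = MaskSse2;       // SSE2 path on x86/x86-64
#else
using MaskHelperSketch = MaskPortable;   // plain C++ fallback
#endif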
@@ -33,12 +33,12 @@
namespace td {
template <int shift>
struct MaskIterator {
uint64_t mask;
uint64 mask;
explicit operator bool() const {
return mask != 0;
}
int pos() const {
return td::count_trailing_zeroes64(mask) / shift;
return count_trailing_zeroes64(mask) / shift;
}
void next() {
mask &= mask - 1;
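MaskIterator walks the set bits of a match mask: pos() is the index of the lowest set bit (divided by shift, since the NEON folly trick encodes four bits per byte lane) and next() clears that bit. The same idiom in isolation (illustrative sketch):
// Illustrative sketch, not part of this commit.
#include "td/utils/bits.h"    // count_trailing_zeroes64
#include "td/utils/common.h"  // uint64
inline void for_each_match_position(td::uint64 mask) {
  while (mask != 0) {
    int pos = td::count_trailing_zeroes64(mask);  // lowest matching lane (shift == 1)
    // ... probe the node at offset `pos` inside the chunk ...
    (void)pos;
    mask &= mask - 1;  // clear the lowest set bit and continue with the next match
  }
}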
@@ -63,8 +63,8 @@ struct MaskIterator {
};
struct MaskPortable {
static MaskIterator<1> equal_mask(uint8_t *bytes, uint8_t needle) {
uint64_t res = 0;
static MaskIterator<1> equal_mask(uint8 *bytes, uint8 needle) {
uint64 res = 0;
for (int i = 0; i < 16; i++) {
res |= (bytes[i] == needle) << i;
}
@@ -74,20 +74,20 @@ struct MaskPortable {
#ifdef __aarch64__
struct MaskNeonFolly {
static MaskIterator<4> equal_mask(uint8_t *bytes, uint8_t needle) {
static MaskIterator<4> equal_mask(uint8 *bytes, uint8 needle) {
uint8x16_t input_mask = vld1q_u8(bytes);
auto needle_mask = vdupq_n_u8(needle);
auto eq_mask = vceqq_u8(input_mask, needle_mask);
// get info from every byte into the bottom half of every uint16_t
// get info from every byte into the bottom half of every uint16
// by shifting right 4, then round to get it into a 64-bit vector
uint8x8_t shifted_eq_mask = vshrn_n_u16(vreinterpretq_u16_u8(eq_mask), 4);
uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(shifted_eq_mask), 0);
uint64 mask = vget_lane_u64(vreinterpret_u64_u8(shifted_eq_mask), 0);
return {mask & 0x11111111111111};
}
};
struct MaskNeon {
static MaskIterator<1> equal_mask(uint8_t *bytes, uint8_t needle) {
static MaskIterator<1> equal_mask(uint8 *bytes, uint8 needle) {
uint8x16_t input_mask = vld1q_u8(bytes);
auto needle_mask = vdupq_n_u8(needle);
auto eq_mask = vceqq_u8(input_mask, needle_mask);
@@ -101,11 +101,11 @@ struct MaskNeon {
};
#elif TD_SSE2
struct MaskSse2 {
static MaskIterator<1> equal_mask(uint8_t *bytes, uint8_t needle) {
static MaskIterator<1> equal_mask(uint8 *bytes, uint8 needle) {
auto input_mask = _mm_loadu_si128(reinterpret_cast<const __m128i *>(bytes));
auto needle_mask = _mm_set1_epi8(needle);
auto match_mask = _mm_cmpeq_epi8(needle_mask, input_mask);
return {static_cast<uint32_t>(_mm_movemask_epi8(match_mask))};
return {static_cast<uint32>(_mm_movemask_epi8(match_mask)) & ((1u << 14) - 1)};
}
};
#endif
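The added `& ((1u << 14) - 1)` matters because the 16-byte SIMD load covers a whole Chunk: only the first CHUNK_SIZE (14) bytes are control bytes, while the last two bytes hold the skipped_cnt counter (see the Chunk definition further down), so those two lanes must never be reported as matches. A sketch of that layout assumption (illustrative, mirroring the fields shown below):
// Illustrative sketch, not part of this commit.
#include "td/utils/common.h"  // uint8, uint16
struct ChunkLayoutSketch {
  td::uint8 ctrl[14];      // one control byte per node slot; 0x0 means "empty"
  td::uint16 skipped_cnt;  // bookkeeping; read by the 16-byte load but masked out
};
static_assert(sizeof(ChunkLayoutSketch) == 16, "exactly one SSE2 register wide");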
@@ -224,7 +224,7 @@ class FlatHashTableChunks {
FlatHashTableChunks(std::initializer_list<Node> nodes) {
reserve(nodes.size());
for (auto &node : td::reversed(nodes)) {
for (auto &node : reversed(nodes)) {
CHECK(!node.empty());
if (count(node.first) > 0) {
continue;
@@ -351,7 +351,7 @@ class FlatHashTableChunks {
used_nodes_++;
return {{node_it, this}, true};
}
CHECK(chunk.skipped_cnt != std::numeric_limits<uint16_t>::max());
CHECK(chunk.skipped_cnt != std::numeric_limits<uint16>::max());
chunk.skipped_cnt++;
chunk_it.next();
}
@@ -413,7 +413,7 @@ class FlatHashTableChunks {
static constexpr int CHUNK_SIZE = 14;
static constexpr int MASK = (1 << CHUNK_SIZE) - 1;
// 0x0 - empty
td::uint8 ctrl[CHUNK_SIZE] = {};
uint8 ctrl[CHUNK_SIZE] = {};
uint16 skipped_cnt{0};
};
fixed_vector<Node> nodes_;
@@ -464,7 +464,7 @@ class FlatHashTableChunks {
struct HashInfo {
size_t chunk_i;
uint8_t small_hash;
uint8 small_hash;
};
struct ChunkIt {
size_t chunk_i;
@@ -488,7 +488,7 @@ class FlatHashTableChunks {
HashInfo calc_hash(const KeyT &key) {
auto h = HashT()(key);
// TODO: will be problematic with current hash.
return {(h >> 8) % chunks_.size(), uint8_t(0x80 | h)};
return {(h >> 8) % chunks_.size(), static_cast<uint8>(0x80 | h)};
}
void resize(size_t new_size) {
@@ -526,7 +526,7 @@ class FlatHashTableChunks {
used_nodes_++;
break;
}
CHECK(chunk.skipped_cnt != std::numeric_limits<uint16_t>::max());
CHECK(chunk.skipped_cnt != std::numeric_limits<uint16>::max());
chunk.skipped_cnt++;
chunk_it.next();
}
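calc_hash above splits a single hash value into two nearly independent parts: the chunk to start probing at, taken from the bits above the low byte, and a 7-bit per-node control byte whose top bit is forced to 1 so it can never collide with the 0x0 "empty" control value. Spelled out as a standalone sketch:
// Illustrative sketch, not part of this commit.
#include <cstddef>
#include "td/utils/common.h"  // uint8
struct HashPartsSketch {
  std::size_t chunk_i;   // index of the chunk to probe first
  td::uint8 small_hash;  // stored in Chunk::ctrl, always has the high bit set
};
inline HashPartsSketch split_hash_sketch(std::size_t h, std::size_t chunk_count) {
  return {(h >> 8) % chunk_count,             // skip the low byte, it feeds small_hash
          static_cast<td::uint8>(0x80 | h)};  // low 7 bits plus the forced marker bit
}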

View File

@@ -9,14 +9,12 @@
#include "td/utils/bits.h"
#include "td/utils/common.h"
#include "td/utils/fixed_vector.h"
#include "td/utils/logging.h"
#include <cstddef>
#include <functional>
#include <initializer_list>
#include <iterator>
#include <new>
#include <unordered_map>
#include <utility>
namespace td {

View File

@@ -1,7 +1,17 @@
//
// Copyright Aliaksei Levin (levlam@telegram.org), Arseny Smirnov (arseny30@gmail.com) 2014-2022
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
//
#pragma once
#include "td/utils/common.h"
#include <utility>
namespace td {
template <class T>
class fixed_vector {
public:
@@ -20,12 +30,17 @@ class fixed_vector {
~fixed_vector() {
delete[] ptr_;
}
using iterator = T *;
using const_iterator = const T *;
T &operator[](size_t i) {
return ptr_[i];
}
const T &operator[](size_t i) const {
return ptr_[i];
}
T *begin() {
return ptr_;
}
@@ -38,14 +53,14 @@ class fixed_vector {
const T *end() const {
return ptr_ + size_;
}
bool empty() const {
return size() == 0;
}
size_t size() const {
return size_;
}
using iterator = T *;
using const_iterator = const T *;
void swap(fixed_vector<T> &other) {
std::swap(ptr_, other.ptr_);
std::swap(size_, other.size_);
@@ -55,4 +70,5 @@ class fixed_vector {
T *ptr_{};
size_t size_{0};
};
} // namespace td
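With the added const operator[], empty() and the iterator typedefs, fixed_vector is now usable from const contexts and range-for loops. A small usage sketch (both constructors used below are assumptions; they sit outside the lines shown here):
// Illustrative sketch, not part of this commit.
#include <cstddef>
#include "td/utils/fixed_vector.h"
inline int sum_of_squares_sketch(std::size_t n) {
  td::fixed_vector<int> values(n);  // assumed: allocates n value-initialized ints
  for (std::size_t i = 0; i < values.size(); i++) {
    values[i] = static_cast<int>(i * i);
  }
  int sum = 0;
  for (int x : values) {  // range-for works because begin()/end() are provided
    sum += x;
  }
  td::fixed_vector<int> other;  // assumed default state: empty() is true, size() is 0
  other.swap(values);           // swap is the cheap way to hand the storage over
  return sum;
}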

View File

@@ -8,6 +8,7 @@
#include "td/utils/common.h"
#include "td/utils/FlatHashMap.h"
#include "td/utils/FlatHashMapChunks.h"
#include "td/utils/logging.h"
#include "td/utils/Random.h"
#include "td/utils/Slice.h"
#include "td/utils/tests.h"
@@ -44,10 +45,10 @@ TEST(FlatHashMapChunks, basic) {
TEST(FlatHashMap, probing) {
auto test = [](int buckets, int elements) {
CHECK(buckets >= elements);
std::vector<bool> data(buckets, false);
td::vector<bool> data(buckets, false);
std::random_device rnd;
std::mt19937 mt(rnd());
std::uniform_int_distribution<int32_t> d(0, buckets - 1);
std::uniform_int_distribution<td::int32> d(0, buckets - 1);
for (int i = 0; i < elements; i++) {
int pos = d(mt);
while (data[pos]) {
@@ -63,16 +64,16 @@ TEST(FlatHashMap, probing) {
for (auto x : data) {
if (x) {
cur_chain++;
max_chain = std::max(max_chain, cur_chain);
max_chain = td::max(max_chain, cur_chain);
} else {
cur_chain = 0;
}
}
LOG(ERROR) << "buckets=" << buckets << " elements=" << elements << " max_chain=" << max_chain;
LOG(INFO) << "Buckets=" << buckets << " elements=" << elements << " max_chain=" << max_chain;
};
test(8192, int(8192 * 0.8));
test(8192, int(8192 * 0.6));
test(8192, int(8192 * 0.3));
test(8192, static_cast<int>(8192 * 0.8));
test(8192, static_cast<int>(8192 * 0.6));
test(8192, static_cast<int>(8192 * 0.3));
}
TEST(FlatHashSet, TL) {
@@ -187,7 +188,7 @@ TEST(FlatHashMap, basic) {
TEST(FlatHashMap, remove_if_basic) {
td::Random::Xorshift128plus rnd(123);
constexpr int TESTS_N = 10000;
constexpr int TESTS_N = 1000;
constexpr int MAX_TABLE_SIZE = 1000;
for (int test_i = 0; test_i < TESTS_N; test_i++) {
std::unordered_map<td::uint64, td::uint64> reference;
@@ -234,7 +235,7 @@ TEST(FlatHashMap, stress_test) {
td::vector<td::RandomSteps::Step> steps;
auto add_step = [&](td::Slice step_name, td::uint32 weight, auto f) {
auto g = [&, step_name, f = std::move(f)]() {
auto g = [&, step_name, f = std::move(f)] {
//LOG(ERROR) << step_name;
//ASSERT_EQ(ref.size(), tbl.size());
f();
@@ -328,7 +329,7 @@ TEST(FlatHashMap, stress_test) {
});
td::RandomSteps runner(std::move(steps));
for (size_t i = 0; i < 10000000; i++) {
for (size_t i = 0; i < 1000000; i++) {
runner.step(rnd);
}
}
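remove_if_basic above and several benchmarks below exercise td::table_remove_if, i.e. erasing every entry for which a predicate returns true while iterating. The generic pattern it stands for looks like this (illustrative sketch; the td helper's exact signature is not shown in this diff):
// Illustrative sketch, not part of this commit.
#include <unordered_map>
#include "td/utils/common.h"  // uint64
template <class TableT, class FuncT>
void table_remove_if_sketch(TableT &table, FuncT &&func) {
  for (auto it = table.begin(); it != table.end();) {
    if (func(*it)) {
      it = table.erase(it);  // erase returns the iterator to the next element
    } else {
      ++it;
    }
  }
}
// Example: drop every entry with an odd key from a reference std::unordered_map,
// the same container the test uses to cross-check the td tables.
inline void remove_odd_keys_sketch(std::unordered_map<td::uint64, td::uint64> &map) {
  table_remove_if_sketch(map, [](const auto &entry) { return entry.first % 2 == 1; });
}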

View File

@@ -23,16 +23,17 @@
#include <benchmark/benchmark.h>
#include <folly/container/F14Map.h>
#include <map>
#include <random>
#include <unordered_map>
#include <utility>
template <class TableT>
static void reserve(TableT &table, size_t size) {
static void reserve(TableT &table, std::size_t size) {
table.reserve(size);
}
template <class A, class B>
static void reserve(std::map<A, B> &table, size_t size) {
static void reserve(std::map<A, B> &table, std::size_t size) {
}
template <class KeyT, class ValueT>
@@ -132,7 +133,7 @@ class SimpleHashTable {
ValueT *find(const KeyT &needle) {
auto hash = HashT()(needle);
size_t i = hash % nodes_.size();
std::size_t i = hash % nodes_.size();
while (true) {
if (nodes_[i].key == needle) {
return &nodes_[i].value;
@@ -185,7 +186,7 @@ static void BM_Get(benchmark::State &state) {
td::vector<KeyValue> data;
td::vector<Key> keys;
for (size_t i = 0; i < n; i++) {
for (std::size_t i = 0; i < n; i++) {
auto key = rnd();
auto value = rnd();
data.emplace_back(key, value);
@@ -193,7 +194,7 @@ static void BM_Get(benchmark::State &state) {
}
TableT table(data.begin(), data.end());
size_t key_i = 0;
std::size_t key_i = 0;
td::random_shuffle(td::as_mutable_span(keys), rnd);
auto next_key = [&] {
key_i++;
@@ -204,7 +205,7 @@ static void BM_Get(benchmark::State &state) {
};
while (state.KeepRunningBatch(BATCH_SIZE)) {
for (size_t i = 0; i < BATCH_SIZE; i++) {
for (std::size_t i = 0; i < BATCH_SIZE; i++) {
benchmark::DoNotOptimize(table.find(next_key()));
}
}
@@ -214,11 +215,11 @@ template <typename TableT>
static void BM_find_same(benchmark::State &state) {
td::Random::Xorshift128plus rnd(123);
TableT table;
constexpr size_t N = 100000;
constexpr size_t BATCH_SIZE = 1024;
constexpr std::size_t N = 100000;
constexpr std::size_t BATCH_SIZE = 1024;
reserve(table, N);
for (size_t i = 0; i < N; i++) {
for (std::size_t i = 0; i < N; i++) {
table.emplace(rnd(), i);
}
@@ -226,7 +227,7 @@ static void BM_find_same(benchmark::State &state) {
table[key] = 123;
while (state.KeepRunningBatch(BATCH_SIZE)) {
for (size_t i = 0; i < BATCH_SIZE; i++) {
for (std::size_t i = 0; i < BATCH_SIZE; i++) {
benchmark::DoNotOptimize(table.find(key));
}
}
@@ -236,11 +237,11 @@ template <typename TableT>
static void BM_emplace_same(benchmark::State &state) {
td::Random::Xorshift128plus rnd(123);
TableT table;
constexpr size_t N = 100000;
constexpr size_t BATCH_SIZE = 1024;
constexpr std::size_t N = 100000;
constexpr std::size_t BATCH_SIZE = 1024;
reserve(table, N);
for (size_t i = 0; i < N; i++) {
for (std::size_t i = 0; i < N; i++) {
table.emplace(rnd(), i);
}
@@ -248,7 +249,7 @@ static void BM_emplace_same(benchmark::State &state) {
table[key] = 123;
while (state.KeepRunningBatch(BATCH_SIZE)) {
for (size_t i = 0; i < BATCH_SIZE; i++) {
for (std::size_t i = 0; i < BATCH_SIZE; i++) {
benchmark::DoNotOptimize(table.emplace(key + (i & 15) * 100, 43784932));
}
}
@@ -271,15 +272,15 @@ static void table_remove_if(absl::flat_hash_map<K, V> &table, FunctT &&func) {
template <typename TableT>
static void BM_remove_if(benchmark::State &state) {
constexpr size_t N = 100000;
constexpr size_t BATCH_SIZE = N;
constexpr std::size_t N = 100000;
constexpr std::size_t BATCH_SIZE = N;
TableT table;
reserve(table, N);
while (state.KeepRunningBatch(BATCH_SIZE)) {
state.PauseTiming();
td::Random::Xorshift128plus rnd(123);
for (size_t i = 0; i < N; i++) {
for (std::size_t i = 0; i < N; i++) {
table.emplace(rnd(), i);
}
state.ResumeTiming();
@@ -290,13 +291,13 @@ static void BM_remove_if(benchmark::State &state) {
template <typename TableT>
static void BM_erase_all_with_begin(benchmark::State &state) {
constexpr size_t N = 100000;
constexpr size_t BATCH_SIZE = N;
constexpr std::size_t N = 100000;
constexpr std::size_t BATCH_SIZE = N;
TableT table;
td::Random::Xorshift128plus rnd(123);
while (state.KeepRunningBatch(BATCH_SIZE)) {
for (size_t i = 0; i < BATCH_SIZE; i++) {
for (std::size_t i = 0; i < BATCH_SIZE; i++) {
table.emplace(rnd() + 1, i);
}
while (!table.empty()) {
@@ -307,14 +308,14 @@ static void BM_erase_all_with_begin(benchmark::State &state) {
template <typename TableT>
static void BM_cache(benchmark::State &state) {
constexpr size_t N = 1000;
constexpr size_t BATCH_SIZE = 1000000;
constexpr std::size_t N = 1000;
constexpr std::size_t BATCH_SIZE = 1000000;
TableT table;
td::Random::Xorshift128plus rnd(123);
td::VectorQueue<td::uint64> keys;
while (state.KeepRunningBatch(BATCH_SIZE)) {
for (size_t i = 0; i < BATCH_SIZE; i++) {
for (std::size_t i = 0; i < BATCH_SIZE; i++) {
auto key = rnd() + 1;
keys.push(key);
table.emplace(key, i);
@@ -327,14 +328,14 @@ static void BM_cache(benchmark::State &state) {
template <typename TableT>
static void BM_cache2(benchmark::State &state) {
constexpr size_t N = 1000;
constexpr size_t BATCH_SIZE = 1000000;
constexpr std::size_t N = 1000;
constexpr std::size_t BATCH_SIZE = 1000000;
TableT table;
td::Random::Xorshift128plus rnd(123);
td::VectorQueue<td::uint64> keys;
while (state.KeepRunningBatch(BATCH_SIZE)) {
for (size_t i = 0; i < BATCH_SIZE; i++) {
for (std::size_t i = 0; i < BATCH_SIZE; i++) {
auto key = rnd() + 1;
keys.push(key);
table.emplace(key, i);
@ -347,20 +348,20 @@ static void BM_cache2(benchmark::State &state) {
template <typename TableT>
static void BM_cache3(benchmark::State &state) {
size_t N = state.range(0);
constexpr size_t BATCH_SIZE = 1000000;
std::size_t N = state.range(0);
constexpr std::size_t BATCH_SIZE = 1000000;
TableT table;
td::Random::Xorshift128plus rnd(123);
td::VectorQueue<td::uint64> keys;
size_t step = 20;
std::size_t step = 20;
while (state.KeepRunningBatch(BATCH_SIZE)) {
for (size_t i = 0; i < BATCH_SIZE; i += step) {
for (std::size_t i = 0; i < BATCH_SIZE; i += step) {
auto key = rnd() + 1;
keys.push(key);
table.emplace(key, i);
for (size_t j = 1; j < step; j++) {
for (std::size_t j = 1; j < step; j++) {
auto key_to_find = keys.data()[rnd() % keys.size()];
benchmark::DoNotOptimize(table.find(key_to_find));
}
@@ -373,24 +374,24 @@ static void BM_cache3(benchmark::State &state) {
}
template <typename TableT>
static void BM_remove_if_slow(benchmark::State &state) {
constexpr size_t N = 5000;
constexpr size_t BATCH_SIZE = 500000;
constexpr std::size_t N = 5000;
constexpr std::size_t BATCH_SIZE = 500000;
TableT table;
td::Random::Xorshift128plus rnd(123);
for (size_t i = 0; i < N; i++) {
for (std::size_t i = 0; i < N; i++) {
table.emplace(rnd() + 1, i);
}
auto first_key = table.begin()->first;
{
size_t cnt = 0;
td::table_remove_if(table, [&cnt](auto &) {
std::size_t cnt = 0;
td::table_remove_if(table, [&cnt, n = N](auto &) {
cnt += 2;
return cnt <= N;
return cnt <= n;
});
}
while (state.KeepRunningBatch(BATCH_SIZE)) {
for (size_t i = 0; i < BATCH_SIZE; i++) {
for (std::size_t i = 0; i < BATCH_SIZE; i++) {
table.emplace(first_key, i);
table.erase(first_key);
}
@@ -398,16 +399,16 @@ static void BM_remove_if_slow(benchmark::State &state) {
}
template <typename TableT>
static void BM_remove_if_slow_old(benchmark::State &state) {
constexpr size_t N = 100000;
constexpr size_t BATCH_SIZE = 5000000;
constexpr std::size_t N = 100000;
constexpr std::size_t BATCH_SIZE = 5000000;
TableT table;
while (state.KeepRunningBatch(BATCH_SIZE)) {
td::Random::Xorshift128plus rnd(123);
for (size_t i = 0; i < BATCH_SIZE; i++) {
for (std::size_t i = 0; i < BATCH_SIZE; i++) {
table.emplace(rnd() + 1, i);
if (table.size() > N) {
size_t cnt = 0;
std::size_t cnt = 0;
td::table_remove_if(table, [&cnt, n = N](auto &) {
cnt += 2;
return cnt <= n;
@@ -421,11 +422,11 @@ template <typename TableT>
static void benchmark_create(td::Slice name) {
td::Random::Xorshift128plus rnd(123);
{
constexpr size_t N = 10000000;
constexpr std::size_t N = 10000000;
TableT table;
reserve(table, N);
auto start = td::Timestamp::now();
for (size_t i = 0; i < N; i++) {
for (std::size_t i = 0; i < N; i++) {
table.emplace(rnd(), i);
}
auto end = td::Timestamp::now();
@@ -433,8 +434,8 @@ static void benchmark_create(td::Slice name) {
<< "create " << N << " elements: " << td::format::as_time(end.at() - start.at());
double res = 0;
td::vector<std::pair<size_t, td::format::Time>> pauses;
for (size_t i = 0; i < N; i++) {
td::vector<std::pair<std::size_t, td::format::Time>> pauses;
for (std::size_t i = 0; i < N; i++) {
auto emplace_start = td::Timestamp::now();
table.emplace(rnd(), i);
auto emplace_end = td::Timestamp::now();
@@ -451,15 +452,15 @@ static void benchmark_create(td::Slice name) {
}
struct CacheMissNode {
uint32_t data{};
td::uint32 data{};
char padding[64 - sizeof(data)];
};
class IterateFast {
public:
static __attribute__((noinline)) uint32_t iterate(CacheMissNode *ptr, size_t max_shift) {
uint32_t res = 1;
for (size_t i = 0; i < max_shift; i++) {
static td::uint32 iterate(CacheMissNode *ptr, std::size_t max_shift) {
td::uint32 res = 1;
for (std::size_t i = 0; i < max_shift; i++) {
if (ptr[i].data % max_shift != 0) {
res *= ptr[i].data;
} else {
@@ -472,9 +473,9 @@ class IterateFast {
class IterateSlow {
public:
static __attribute__((noinline)) uint32_t iterate(CacheMissNode *ptr, size_t max_shift) {
uint32_t res = 1;
for (size_t i = 0;; i++) {
static td::uint32 iterate(CacheMissNode *ptr, std::size_t max_shift) {
td::uint32 res = 1;
for (std::size_t i = 0;; i++) {
if (ptr[i].data % max_shift != 0) {
res *= ptr[i].data;
} else {
@@ -484,16 +485,16 @@ class IterateSlow {
return res;
}
};
#include <random>
template <class F>
void BM_cache_miss(benchmark::State &state) {
uint32_t max_shift = state.range(0);
static void BM_cache_miss(benchmark::State &state) {
td::uint32 max_shift = state.range(0);
bool flag = state.range(1);
std::random_device rd;
std::mt19937 rnd(rd());
int N = 50000000;
std::vector<CacheMissNode> nodes(N);
uint32_t i = 0;
td::vector<CacheMissNode> nodes(N);
td::uint32 i = 0;
for (auto &node : nodes) {
if (flag) {
node.data = i++ % max_shift;
@@ -502,8 +503,8 @@ void BM_cache_miss(benchmark::State &state) {
}
}
std::vector<int> positions(N);
std::uniform_int_distribution<uint32_t> rnd_pos(0, N - 1000);
td::vector<int> positions(N);
std::uniform_int_distribution<td::uint32> rnd_pos(0, N - 1000);
for (auto &pos : positions) {
pos = rnd_pos(rnd);
if (flag) {
@@ -520,7 +521,7 @@ void BM_cache_miss(benchmark::State &state) {
}
}
uint64_t equal_mask_slow(uint8_t *bytes, uint8_t needle) {
static uint64_t equal_mask_slow(td::uint8 *bytes, td::uint8 needle) {
uint64_t mask = 0;
for (int i = 0; i < 16; i++) {
mask |= (bytes[i] == needle) << i;
@@ -529,19 +529,20 @@ uint64_t equal_mask_slow(uint8_t *bytes, uint8_t needle) {
}
template <class MaskT>
void BM_mask(benchmark::State &state) {
size_t BATCH_SIZE = 1024;
std::vector<uint8_t> bytes(BATCH_SIZE + 16);
static void BM_mask(benchmark::State &state) {
std::size_t BATCH_SIZE = 1024;
td::vector<td::uint8> bytes(BATCH_SIZE + 16);
for (auto &b : bytes) {
b = static_cast<uint8_t>(td::Random::fast(0, 17));
b = static_cast<td::uint8>(td::Random::fast(0, 17));
}
while (state.KeepRunningBatch(BATCH_SIZE)) {
for (size_t i = 0; i < BATCH_SIZE; i++) {
for (std::size_t i = 0; i < BATCH_SIZE; i++) {
benchmark::DoNotOptimize(MaskT::equal_mask(bytes.data() + i, 17));
}
}
}
BENCHMARK_TEMPLATE(BM_mask, td::MaskPortable);
#ifdef __aarch64__
BENCHMARK_TEMPLATE(BM_mask, td::MaskNeonFolly);