2020-10-09 17:35:53 +03:00
|
|
|
//
|
2023-01-01 00:28:08 +03:00
|
|
|
// Copyright Aliaksei Levin (levlam@telegram.org), Arseny Smirnov (arseny30@gmail.com) 2014-2023
|
2020-10-09 17:35:53 +03:00
|
|
|
//
|
|
|
|
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
|
|
|
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
|
|
|
//
|
2020-10-12 10:24:30 +03:00
|
|
|
#include "td/utils/common.h"
|
2020-10-09 17:35:53 +03:00
|
|
|
#include "td/utils/misc.h"
|
2020-10-12 10:24:30 +03:00
|
|
|
#include "td/utils/Slice.h"
|
|
|
|
#include "td/utils/Status.h"
|
|
|
|
#include "td/utils/tests.h"
|
2020-10-09 17:35:53 +03:00
|
|
|
#include "td/utils/utf8.h"
|
|
|
|
|
2020-10-12 10:24:30 +03:00
|
|
|
#include <algorithm>
|
|
|
|
|
2020-10-09 17:35:53 +03:00
|
|
|
namespace td {
|
2020-10-12 10:24:30 +03:00
|
|
|
|
2020-10-09 17:35:53 +03:00
|
|
|
class RangeSet {
|
|
|
|
template <class T>
|
|
|
|
static auto find(T &ranges, int64 begin) {
|
|
|
|
return std::lower_bound(ranges.begin(), ranges.end(), begin,
|
|
|
|
[](const Range &range, int64 begin) { return range.end < begin; });
|
|
|
|
}
|
|
|
|
auto find(int64 begin) const {
|
|
|
|
return find(ranges_, begin);
|
|
|
|
}
|
|
|
|
auto find(int64 begin) {
|
|
|
|
return find(ranges_, begin);
|
|
|
|
}
|
|
|
|
|
|
|
|
public:
|
|
|
|
struct Range {
|
|
|
|
int64 begin;
|
|
|
|
int64 end;
|
|
|
|
};
|
|
|
|
|
2020-10-12 10:24:30 +03:00
|
|
|
static constexpr int64 BIT_SIZE = 1024;
|
2020-10-09 17:35:53 +03:00
|
|
|
|
|
|
|
static RangeSet create_one_range(int64 end, int64 begin = 0) {
|
|
|
|
RangeSet res;
|
|
|
|
res.ranges_.push_back({begin, end});
|
|
|
|
return res;
|
|
|
|
}
|
2020-10-12 10:24:30 +03:00
|
|
|
static Result<RangeSet> decode(CSlice data) {
|
2020-10-09 17:35:53 +03:00
|
|
|
if (!check_utf8(data)) {
|
|
|
|
return Status::Error("Invalid encoding");
|
|
|
|
}
|
|
|
|
uint32 curr = 0;
|
|
|
|
bool is_empty = false;
|
|
|
|
RangeSet res;
|
|
|
|
for (auto begin = data.ubegin(); begin != data.uend();) {
|
|
|
|
uint32 size;
|
2022-08-19 16:37:44 +03:00
|
|
|
begin = next_utf8_unsafe(begin, &size);
|
2020-10-09 17:35:53 +03:00
|
|
|
|
|
|
|
if (!is_empty && size != 0) {
|
2020-10-12 10:24:30 +03:00
|
|
|
res.ranges_.push_back({curr * BIT_SIZE, (curr + size) * BIT_SIZE});
|
2020-10-09 17:35:53 +03:00
|
|
|
}
|
|
|
|
curr += size;
|
|
|
|
is_empty = !is_empty;
|
|
|
|
}
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
2020-10-12 10:24:30 +03:00
|
|
|
string encode(int64 prefix_size = -1) const {
|
|
|
|
vector<uint32> sizes;
|
2020-10-09 17:35:53 +03:00
|
|
|
uint32 all_end = 0;
|
|
|
|
|
|
|
|
if (prefix_size != -1) {
|
2020-10-12 10:24:30 +03:00
|
|
|
prefix_size = (prefix_size + BIT_SIZE - 1) / BIT_SIZE * BIT_SIZE;
|
2020-10-09 17:35:53 +03:00
|
|
|
}
|
|
|
|
for (auto it : ranges_) {
|
|
|
|
if (prefix_size != -1 && it.begin >= prefix_size) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (prefix_size != -1 && it.end > prefix_size) {
|
|
|
|
it.end = prefix_size;
|
|
|
|
}
|
|
|
|
|
2020-10-12 10:24:30 +03:00
|
|
|
CHECK(it.begin % BIT_SIZE == 0);
|
|
|
|
CHECK(it.end % BIT_SIZE == 0);
|
2021-10-20 01:27:02 +03:00
|
|
|
auto begin = narrow_cast<uint32>(it.begin / BIT_SIZE);
|
|
|
|
auto end = narrow_cast<uint32>(it.end / BIT_SIZE);
|
2020-10-09 17:35:53 +03:00
|
|
|
if (sizes.empty()) {
|
|
|
|
if (begin != 0) {
|
|
|
|
sizes.push_back(0);
|
|
|
|
sizes.push_back(begin);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
sizes.push_back(begin - all_end);
|
|
|
|
}
|
|
|
|
sizes.push_back(end - begin);
|
|
|
|
all_end = end;
|
|
|
|
}
|
|
|
|
|
2020-10-12 10:24:30 +03:00
|
|
|
string res;
|
2020-10-09 17:35:53 +03:00
|
|
|
for (auto c : sizes) {
|
|
|
|
append_utf8_character(res, c);
|
|
|
|
}
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
|
|
|
int64 get_ready_prefix_size(int64 offset, int64 file_size = -1) const {
|
|
|
|
auto it = find(offset);
|
|
|
|
if (it == ranges_.end()) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
if (it->begin > offset) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
CHECK(offset >= it->begin);
|
|
|
|
CHECK(offset <= it->end);
|
|
|
|
auto end = it->end;
|
|
|
|
if (file_size != -1 && end > file_size) {
|
|
|
|
end = file_size;
|
|
|
|
}
|
|
|
|
if (end < offset) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
return end - offset;
|
|
|
|
}
|
|
|
|
int64 get_total_size(int64 file_size) const {
|
|
|
|
int64 res = 0;
|
|
|
|
for (auto it : ranges_) {
|
|
|
|
if (it.begin >= file_size) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (it.end > file_size) {
|
|
|
|
it.end = file_size;
|
|
|
|
}
|
|
|
|
res += it.end - it.begin;
|
|
|
|
}
|
|
|
|
return res;
|
|
|
|
}
|
2021-03-16 14:35:32 +03:00
|
|
|
int64 get_ready_parts(int64 offset_part, int32 part_size) const {
|
2020-10-09 17:35:53 +03:00
|
|
|
auto offset = offset_part * part_size;
|
|
|
|
auto it = find(offset);
|
|
|
|
if (it == ranges_.end()) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
if (it->begin > offset) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
return (it->end - offset) / part_size;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool is_ready(int64 begin, int64 end) const {
|
|
|
|
auto it = find(begin);
|
|
|
|
if (it == ranges_.end()) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
return it->begin <= begin && end <= it->end;
|
|
|
|
}
|
|
|
|
|
|
|
|
void set(int64 begin, int64 end) {
|
2020-10-12 10:24:30 +03:00
|
|
|
CHECK(begin % BIT_SIZE == 0);
|
|
|
|
CHECK(end % BIT_SIZE == 0);
|
2020-10-09 17:35:53 +03:00
|
|
|
// 1. skip all with r.end < begin
|
|
|
|
auto it_begin = find(begin);
|
|
|
|
|
|
|
|
// 2. combine with all r.begin <= end
|
|
|
|
auto it_end = it_begin;
|
|
|
|
for (; it_end != ranges_.end() && it_end->begin <= end; ++it_end) {
|
|
|
|
}
|
|
|
|
|
|
|
|
if (it_begin == it_end) {
|
|
|
|
ranges_.insert(it_begin, Range{begin, end});
|
|
|
|
} else {
|
2020-10-12 10:24:30 +03:00
|
|
|
begin = td::min(begin, it_begin->begin);
|
2020-10-09 17:35:53 +03:00
|
|
|
--it_end;
|
2020-10-12 10:24:30 +03:00
|
|
|
end = td::max(end, it_end->end);
|
2020-10-09 17:35:53 +03:00
|
|
|
*it_end = Range{begin, end};
|
|
|
|
ranges_.erase(it_begin, it_end);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-10-12 10:24:30 +03:00
|
|
|
vector<int32> as_vector(int32 part_size) const {
|
|
|
|
vector<int32> res;
|
2021-11-10 20:39:35 +03:00
|
|
|
for (const auto &it : ranges_) {
|
2020-10-09 17:35:53 +03:00
|
|
|
auto begin = narrow_cast<int32>((it.begin + part_size - 1) / part_size);
|
|
|
|
auto end = narrow_cast<int32>(it.end / part_size);
|
|
|
|
while (begin < end) {
|
|
|
|
res.push_back(begin++);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
2020-10-12 10:24:30 +03:00
|
|
|
vector<Range> ranges_;
|
2020-10-09 17:35:53 +03:00
|
|
|
};
|
|
|
|
|
|
|
|
// Checks RangeSet: stability of encode/decode, merging of ranges in set() and the size queries.
TEST(Bitmask, simple) {
  // Passes the set through encode -> decode -> encode, requires both encodings to be equal,
  // and replaces the set with its decoded copy so that later checks run against decoded data.
  auto check_round_trip = [](auto &range_set) {
    auto encoded = range_set.encode();
    RangeSet decoded = RangeSet::decode(encoded).move_as_ok();
    auto reencoded = decoded.encode();
    range_set = std::move(decoded);
    CHECK(reencoded == encoded);
  };

  // Several equally sized ranges separated by equally sized gaps, located far from the origin.
  {
    RangeSet range_set;
    int32 chunk = 128 * 1024;
    int32 base = chunk * 5000;
    for (int i = 2; i < 30; i += 2) {
      range_set.set(base + chunk * i, base + chunk * (i + 1));
    }
    check_round_trip(range_set);
  }

  // Incremental construction; all positions below are expressed in chunks of 1024.
  {
    RangeSet range_set;
    int32 chunk = 1024;

    // number of whole chunks contiguously ready starting from the given chunk
    auto ready_chunks = [&](auto pos) {
      return range_set.get_ready_prefix_size(pos * chunk) / chunk;
    };
    // marks chunks [from, to) as ready and verifies the immediate consequences
    auto mark_ready = [&](auto from, auto to) {
      range_set.set(from * chunk, to * chunk);
      check_round_trip(range_set);
      ASSERT_TRUE(range_set.is_ready(from * chunk, to * chunk));
      ASSERT_TRUE(ready_chunks(from) >= (to - from));
    };

    mark_ready(6, 7);
    ASSERT_EQ(1, ready_chunks(6));
    ASSERT_EQ(0, ready_chunks(5));

    mark_ready(8, 9);
    ASSERT_EQ(0, ready_chunks(7));

    mark_ready(7, 8);
    ASSERT_EQ(2, ready_chunks(7));
    ASSERT_EQ(3, ready_chunks(6));

    mark_ready(3, 5);
    ASSERT_EQ(1, ready_chunks(4));

    mark_ready(4, 6);
    ASSERT_EQ(5, ready_chunks(4));

    mark_ready(10, 11);
    mark_ready(9, 10);
    ASSERT_EQ(8, ready_chunks(3));

    mark_ready(14, 16);
    mark_ready(12, 13);
    ASSERT_EQ(8, ready_chunks(3));

    ASSERT_EQ(10, range_set.get_ready_prefix_size(chunk * 3, chunk * 3 + 10));
    ASSERT_TRUE(!range_set.is_ready(chunk * 11, chunk * 12));
    ASSERT_EQ(3, range_set.get_ready_parts(2, chunk * 2));
    ASSERT_EQ(vector<int32>({2, 3, 4, 7}), range_set.as_vector(chunk * 2));
  }
}
|
2020-10-12 10:24:30 +03:00
|
|
|
|
2020-10-09 17:35:53 +03:00
|
|
|
} // namespace td
|