Add watchdog for main thread hanging.

This commit is contained in:
levlam 2022-08-22 02:26:36 +03:00
parent 90910f6ded
commit aa9eff357c
4 changed files with 78 additions and 2 deletions

View File

@ -85,6 +85,7 @@ set(TELEGRAM_BOT_API_SOURCE
telegram-bot-api/HttpStatConnection.cpp telegram-bot-api/HttpStatConnection.cpp
telegram-bot-api/Query.cpp telegram-bot-api/Query.cpp
telegram-bot-api/Stats.cpp telegram-bot-api/Stats.cpp
telegram-bot-api/Watchdog.cpp
telegram-bot-api/WebhookActor.cpp telegram-bot-api/WebhookActor.cpp
telegram-bot-api/Client.h telegram-bot-api/Client.h
@ -95,6 +96,7 @@ set(TELEGRAM_BOT_API_SOURCE
telegram-bot-api/HttpStatConnection.h telegram-bot-api/HttpStatConnection.h
telegram-bot-api/Query.h telegram-bot-api/Query.h
telegram-bot-api/Stats.h telegram-bot-api/Stats.h
telegram-bot-api/Watchdog.h
telegram-bot-api/WebhookActor.h telegram-bot-api/WebhookActor.h
) )

View File

@ -0,0 +1,27 @@
//
// Copyright Aliaksei Levin (levlam@telegram.org), Arseny Smirnov (arseny30@gmail.com) 2014-2021
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
//
#include "telegram-bot-api/Watchdog.h"
#include "td/utils/Time.h"
namespace telegram_bot_api {
void Watchdog::kick() {
auto now = td::Time::now();
if (now >= last_kick_time_ + timeout_ && last_kick_time_ > 0) {
LOG(ERROR) << "Watchdog timeout expired after " << now - last_kick_time_ << " seconds";
td::thread::send_real_time_signal(main_thread_id_, 2);
}
last_kick_time_ = now;
set_timeout_in(timeout_);
}
void Watchdog::timeout_expired() {
kick();
}
} // namespace telegram_bot_api

View File

@ -0,0 +1,31 @@
//
// Copyright Aliaksei Levin (levlam@telegram.org), Arseny Smirnov (arseny30@gmail.com) 2014-2021
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
//
#pragma once
#include "td/actor/actor.h"
#include "td/utils/port/thread.h"
namespace telegram_bot_api {
class Watchdog final : public td::Actor {
public:
Watchdog(td::thread::id main_thread_id, double timeout) : main_thread_id_(main_thread_id), timeout_(timeout) {
// watchdog is disabled until it is kicked for the first time
}
void kick();
private:
void timeout_expired() final;
td::thread::id main_thread_id_;
double timeout_;
double last_kick_time_ = 0.0;
};
} // namespace telegram_bot_api

View File

@ -11,6 +11,7 @@
#include "telegram-bot-api/HttpStatConnection.h" #include "telegram-bot-api/HttpStatConnection.h"
#include "telegram-bot-api/Query.h" #include "telegram-bot-api/Query.h"
#include "telegram-bot-api/Stats.h" #include "telegram-bot-api/Stats.h"
#include "telegram-bot-api/Watchdog.h"
#include "td/telegram/ClientActor.h" #include "td/telegram/ClientActor.h"
@ -462,7 +463,7 @@ int main(int argc, char *argv[]) {
// << (td::GitInfo::is_dirty() ? "(dirty)" : "") << " started"; // << (td::GitInfo::is_dirty() ? "(dirty)" : "") << " started";
LOG(WARNING) << "Bot API " << parameters->version_ << " server started"; LOG(WARNING) << "Bot API " << parameters->version_ << " server started";
const int threads_n = 5; // +3 for Td, one for slow HTTP connections and one for DNS resolving const int threads_n = 6; // +3 for Td, one for watchdog, one for slow HTTP connections, one for DNS resolving
td::ConcurrentScheduler sched; td::ConcurrentScheduler sched;
sched.init(threads_n); sched.init(threads_n);
@ -474,6 +475,7 @@ int main(int argc, char *argv[]) {
auto client_manager = auto client_manager =
sched.create_actor_unsafe<ClientManager>(0, "ClientManager", std::move(parameters), token_range).release(); sched.create_actor_unsafe<ClientManager>(0, "ClientManager", std::move(parameters), token_range).release();
sched sched
.create_actor_unsafe<HttpServer>( .create_actor_unsafe<HttpServer>(
0, "HttpServer", http_ip_address, http_port, 0, "HttpServer", http_ip_address, http_port,
@ -482,6 +484,7 @@ int main(int argc, char *argv[]) {
td::create_actor<HttpConnection>("HttpConnection", client_manager, shared_data)); td::create_actor<HttpConnection>("HttpConnection", client_manager, shared_data));
}) })
.release(); .release();
if (http_stat_port != 0) { if (http_stat_port != 0) {
sched sched
.create_actor_unsafe<HttpServer>( .create_actor_unsafe<HttpServer>(
@ -492,8 +495,14 @@ int main(int argc, char *argv[]) {
}) })
.release(); .release();
} }
constexpr double WATCHDOG_TIMEOUT = 0.5;
auto watchdog_id =
sched.create_actor_unsafe<Watchdog>(threads_n - 2, "Watchdog", td::this_thread::get_id(), WATCHDOG_TIMEOUT);
sched.start(); sched.start();
double next_watchdog_kick_time = start_time;
double next_cron_time = start_time; double next_cron_time = start_time;
double last_dump_time = start_time - 1000.0; double last_dump_time = start_time - 1000.0;
double last_tqueue_gc_time = start_time - 1000.0; double last_tqueue_gc_time = start_time - 1000.0;
@ -503,7 +512,7 @@ int main(int argc, char *argv[]) {
std::atomic_bool can_quit{false}; std::atomic_bool can_quit{false};
ServerCpuStat::instance(); // create ServerCpuStat instance ServerCpuStat::instance(); // create ServerCpuStat instance
while (true) { while (true) {
sched.run_main(next_cron_time - td::Time::now()); sched.run_main(td::min(next_cron_time, next_watchdog_kick_time) - td::Time::now());
if (!need_reopen_log.test_and_set()) { if (!need_reopen_log.test_and_set()) {
td::log_interface->after_rotation(); td::log_interface->after_rotation();
@ -519,6 +528,7 @@ int main(int argc, char *argv[]) {
dump_statistics(shared_data, net_query_stats); dump_statistics(shared_data, net_query_stats);
close_flag = true; close_flag = true;
auto guard = sched.get_main_guard(); auto guard = sched.get_main_guard();
watchdog_id.reset();
send_closure(client_manager, &ClientManager::close, td::PromiseCreator::lambda([&can_quit](td::Unit) { send_closure(client_manager, &ClientManager::close, td::PromiseCreator::lambda([&can_quit](td::Unit) {
can_quit.store(true); can_quit.store(true);
td::Scheduler::instance()->yield(); td::Scheduler::instance()->yield();
@ -557,6 +567,12 @@ int main(int argc, char *argv[]) {
ServerCpuStat::update(now); ServerCpuStat::update(now);
} }
if (now >= start_time + 600) {
auto guard = sched.get_main_guard();
send_closure(watchdog_id, &Watchdog::kick);
next_watchdog_kick_time = now + WATCHDOG_TIMEOUT / 2;
}
if (now > last_tqueue_gc_time + 60.0) { if (now > last_tqueue_gc_time + 60.0) {
auto unix_time = shared_data->get_unix_time(now); auto unix_time = shared_data->get_unix_time(now);
LOG(INFO) << "Run TQueue GC at " << unix_time; LOG(INFO) << "Run TQueue GC at " << unix_time;