From aa9eff357c6ad3c254a3e4b53df37e2b82aa4b30 Mon Sep 17 00:00:00 2001 From: levlam Date: Mon, 22 Aug 2022 02:26:36 +0300 Subject: [PATCH] Add watchdog for main thread hanging. --- CMakeLists.txt | 2 ++ telegram-bot-api/Watchdog.cpp | 27 +++++++++++++++++++++++ telegram-bot-api/Watchdog.h | 31 +++++++++++++++++++++++++++ telegram-bot-api/telegram-bot-api.cpp | 20 +++++++++++++++-- 4 files changed, 78 insertions(+), 2 deletions(-) create mode 100644 telegram-bot-api/Watchdog.cpp create mode 100644 telegram-bot-api/Watchdog.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 45d5df6..5f1bad6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -85,6 +85,7 @@ set(TELEGRAM_BOT_API_SOURCE telegram-bot-api/HttpStatConnection.cpp telegram-bot-api/Query.cpp telegram-bot-api/Stats.cpp + telegram-bot-api/Watchdog.cpp telegram-bot-api/WebhookActor.cpp telegram-bot-api/Client.h @@ -95,6 +96,7 @@ set(TELEGRAM_BOT_API_SOURCE telegram-bot-api/HttpStatConnection.h telegram-bot-api/Query.h telegram-bot-api/Stats.h + telegram-bot-api/Watchdog.h telegram-bot-api/WebhookActor.h ) diff --git a/telegram-bot-api/Watchdog.cpp b/telegram-bot-api/Watchdog.cpp new file mode 100644 index 0000000..d715574 --- /dev/null +++ b/telegram-bot-api/Watchdog.cpp @@ -0,0 +1,27 @@ +// +// Copyright Aliaksei Levin (levlam@telegram.org), Arseny Smirnov (arseny30@gmail.com) 2014-2021 +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +#include "telegram-bot-api/Watchdog.h" + +#include "td/utils/Time.h" + +namespace telegram_bot_api { + +void Watchdog::kick() { + auto now = td::Time::now(); + if (now >= last_kick_time_ + timeout_ && last_kick_time_ > 0) { + LOG(ERROR) << "Watchdog timeout expired after " << now - last_kick_time_ << " seconds"; + td::thread::send_real_time_signal(main_thread_id_, 2); + } + last_kick_time_ = now; + set_timeout_in(timeout_); +} + +void Watchdog::timeout_expired() { + kick(); +} + +} // namespace telegram_bot_api diff --git a/telegram-bot-api/Watchdog.h b/telegram-bot-api/Watchdog.h new file mode 100644 index 0000000..5ab0115 --- /dev/null +++ b/telegram-bot-api/Watchdog.h @@ -0,0 +1,31 @@ +// +// Copyright Aliaksei Levin (levlam@telegram.org), Arseny Smirnov (arseny30@gmail.com) 2014-2021 +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +#pragma once + +#include "td/actor/actor.h" + +#include "td/utils/port/thread.h" + +namespace telegram_bot_api { + +class Watchdog final : public td::Actor { + public: + Watchdog(td::thread::id main_thread_id, double timeout) : main_thread_id_(main_thread_id), timeout_(timeout) { + // watchdog is disabled until it is kicked for the first time + } + + void kick(); + + private: + void timeout_expired() final; + + td::thread::id main_thread_id_; + double timeout_; + double last_kick_time_ = 0.0; +}; + +} // namespace telegram_bot_api diff --git a/telegram-bot-api/telegram-bot-api.cpp b/telegram-bot-api/telegram-bot-api.cpp index 6228a59..419dfcf 100644 --- a/telegram-bot-api/telegram-bot-api.cpp +++ b/telegram-bot-api/telegram-bot-api.cpp @@ -11,6 +11,7 @@ #include "telegram-bot-api/HttpStatConnection.h" #include "telegram-bot-api/Query.h" #include "telegram-bot-api/Stats.h" +#include "telegram-bot-api/Watchdog.h" #include "td/telegram/ClientActor.h" @@ -462,7 +463,7 @@ int main(int argc, char *argv[]) { // << (td::GitInfo::is_dirty() ? "(dirty)" : "") << " started"; LOG(WARNING) << "Bot API " << parameters->version_ << " server started"; - const int threads_n = 5; // +3 for Td, one for slow HTTP connections and one for DNS resolving + const int threads_n = 6; // +3 for Td, one for watchdog, one for slow HTTP connections, one for DNS resolving td::ConcurrentScheduler sched; sched.init(threads_n); @@ -474,6 +475,7 @@ int main(int argc, char *argv[]) { auto client_manager = sched.create_actor_unsafe(0, "ClientManager", std::move(parameters), token_range).release(); + sched .create_actor_unsafe( 0, "HttpServer", http_ip_address, http_port, @@ -482,6 +484,7 @@ int main(int argc, char *argv[]) { td::create_actor("HttpConnection", client_manager, shared_data)); }) .release(); + if (http_stat_port != 0) { sched .create_actor_unsafe( @@ -492,8 +495,14 @@ int main(int argc, char *argv[]) { }) .release(); } + + constexpr double WATCHDOG_TIMEOUT = 0.5; + auto watchdog_id = + sched.create_actor_unsafe(threads_n - 2, "Watchdog", td::this_thread::get_id(), WATCHDOG_TIMEOUT); + sched.start(); + double next_watchdog_kick_time = start_time; double next_cron_time = start_time; double last_dump_time = start_time - 1000.0; double last_tqueue_gc_time = start_time - 1000.0; @@ -503,7 +512,7 @@ int main(int argc, char *argv[]) { std::atomic_bool can_quit{false}; ServerCpuStat::instance(); // create ServerCpuStat instance while (true) { - sched.run_main(next_cron_time - td::Time::now()); + sched.run_main(td::min(next_cron_time, next_watchdog_kick_time) - td::Time::now()); if (!need_reopen_log.test_and_set()) { td::log_interface->after_rotation(); @@ -519,6 +528,7 @@ int main(int argc, char *argv[]) { dump_statistics(shared_data, net_query_stats); close_flag = true; auto guard = sched.get_main_guard(); + watchdog_id.reset(); send_closure(client_manager, &ClientManager::close, td::PromiseCreator::lambda([&can_quit](td::Unit) { can_quit.store(true); td::Scheduler::instance()->yield(); @@ -557,6 +567,12 @@ int main(int argc, char *argv[]) { ServerCpuStat::update(now); } + if (now >= start_time + 600) { + auto guard = sched.get_main_guard(); + send_closure(watchdog_id, &Watchdog::kick); + next_watchdog_kick_time = now + WATCHDOG_TIMEOUT / 2; + } + if (now > last_tqueue_gc_time + 60.0) { auto unix_time = shared_data->get_unix_time(now); LOG(INFO) << "Run TQueue GC at " << unix_time;