From 54cf94ea59a0026a01c95b4ccc61c07a6fe2063e Mon Sep 17 00:00:00 2001 From: Michael Sproul Date: Mon, 6 Jun 2022 23:52:32 +0000 Subject: [PATCH] Fix per-slot timer in presence of clock changes (#3243) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Issue Addressed Fixes a timing issue that results in spurious fork choice notifier failures: ``` WARN Error signalling fork choice waiter slot: 3962270, error: ForkChoiceSignalOutOfOrder { current: Slot(3962271), latest: Slot(3962270) }, service: beacon ``` There’s a fork choice run that is scheduled to run at the start of every slot by the `timer`, which creates a 12s interval timer when the beacon node starts up. The problem is that if there’s a bit of clock drift that gets corrected via NTP (or a leap second for that matter) then these 12s intervals will cease to line up with the start of the slot. This then creates the mismatch in slot number that we see above. Lighthouse also runs fork choice 500ms before the slot begins, and these runs are what is conflicting with the start-of-slot runs. This means that the warning in current versions of Lighthouse is mostly cosmetic because fork choice is up to date with all but the most recent 500ms of attestations (which usually isn’t many). ## Proposed Changes Fix the per-slot timer so that it continually re-calculates the duration to the start of the next slot and waits for that. A side-effect of this change is that we may skip slots if the per-slot task takes >12s to run, but I think this is an unlikely scenario and an acceptable compromise. --- beacon_node/client/src/builder.rs | 7 +------ beacon_node/timer/src/lib.rs | 24 ++++++++++++------------ 2 files changed, 13 insertions(+), 18 deletions(-) diff --git a/beacon_node/client/src/builder.rs b/beacon_node/client/src/builder.rs index 59f1bebdb41..1f02ec7b3c3 100644 --- a/beacon_node/client/src/builder.rs +++ b/beacon_node/client/src/builder.rs @@ -485,13 +485,8 @@ where .beacon_chain .clone() .ok_or("node timer requires a beacon chain")?; - let seconds_per_slot = self - .chain_spec - .as_ref() - .ok_or("node timer requires a chain spec")? - .seconds_per_slot; - spawn_timer(context.executor, beacon_chain, seconds_per_slot) + spawn_timer(context.executor, beacon_chain) .map_err(|e| format!("Unable to start node timer: {}", e))?; Ok(self) diff --git a/beacon_node/timer/src/lib.rs b/beacon_node/timer/src/lib.rs index 9c6bf1ca870..bf2acaf5bb5 100644 --- a/beacon_node/timer/src/lib.rs +++ b/beacon_node/timer/src/lib.rs @@ -6,29 +6,29 @@ use beacon_chain::{BeaconChain, BeaconChainTypes}; use slog::{debug, info, warn}; use slot_clock::SlotClock; use std::sync::Arc; -use std::time::Duration; -use tokio::time::{interval_at, Instant}; +use tokio::time::sleep; /// Spawns a timer service which periodically executes tasks for the beacon chain pub fn spawn_timer( executor: task_executor::TaskExecutor, beacon_chain: Arc>, - seconds_per_slot: u64, ) -> Result<(), &'static str> { let log = executor.log(); - let start_instant = Instant::now() - + beacon_chain - .slot_clock - .duration_to_next_slot() - .ok_or("slot_notifier unable to determine time to next slot")?; - - // Warning: `interval_at` panics if `seconds_per_slot` = 0. - let mut interval = interval_at(start_instant, Duration::from_secs(seconds_per_slot)); let per_slot_executor = executor.clone(); + let timer_future = async move { let log = per_slot_executor.log().clone(); loop { - interval.tick().await; + let duration_to_next_slot = match beacon_chain.slot_clock.duration_to_next_slot() { + Some(duration) => duration, + None => { + warn!(log, "Unable to determine duration to next slot"); + return; + } + }; + + sleep(duration_to_next_slot).await; + let chain = beacon_chain.clone(); if let Some(handle) = per_slot_executor .spawn_blocking_handle(move || chain.per_slot_task(), "timer_per_slot_task")