From c68e20dd4eaf552022167dad214b85f81ca8b229 Mon Sep 17 00:00:00 2001 From: Yoann Lecuyer Date: Thu, 11 Mar 2021 18:17:44 +0100 Subject: [PATCH] Add debug to worker timeout and startup (#2559) Logs reason for worker timeout and the startup time when worker boots --- History.md | 1 + lib/puma/cluster.rb | 9 +++++++-- lib/puma/cluster/worker_handle.rb | 4 ++++ test/helpers/integration.rb | 6 +++++- test/test_integration_cluster.rb | 8 ++++---- test/test_preserve_bundler_env.rb | 2 +- test/test_worker_gem_independence.rb | 10 +++++----- 7 files changed, 27 insertions(+), 13 deletions(-) diff --git a/History.md b/History.md index 15fc6d8e8b..38e1004d97 100644 --- a/History.md +++ b/History.md @@ -3,6 +3,7 @@ * Features * Your feature goes here (#Github Number) * Warn when running Cluster mode with a single worker (#2565) + * Add reason to worker time out and startup time when worked boots ([#2528]) * Bugfixes * Your bugfix goes here (#Github Number) diff --git a/lib/puma/cluster.rb b/lib/puma/cluster.rb index 6bd25b4ab8..d31e8f76e7 100644 --- a/lib/puma/cluster.rb +++ b/lib/puma/cluster.rb @@ -436,7 +436,7 @@ def run case req when "b" w.boot! - log "- Worker #{w.index} (PID: #{pid}) booted, phase: #{w.phase}" + log "- Worker #{w.index} (PID: #{pid}) booted in #{w.uptime.round(2)}s, phase: #{w.phase}" @next_check = Time.now when "e" # external term, see worker method, Signal.trap "SIGTERM" @@ -510,7 +510,12 @@ def wait_workers def timeout_workers @workers.each do |w| if !w.term? && w.ping_timeout <= Time.now - log "! Terminating timed out worker: #{w.pid}" + details = if w.booted? + "(worker failed to check in within #{@options[:worker_timeout]} seconds)" + else + "(worker failed to boot within #{@options[:worker_boot_timeout]} seconds)" + end + log "! Terminating timed out worker #{details}: #{w.pid}" w.kill end end diff --git a/lib/puma/cluster/worker_handle.rb b/lib/puma/cluster/worker_handle.rb index ea3814e0cb..be73da95e8 100644 --- a/lib/puma/cluster/worker_handle.rb +++ b/lib/puma/cluster/worker_handle.rb @@ -31,6 +31,10 @@ def booted? @stage == :booted end + def uptime + Time.now - started_at + end + def boot! @last_checkin = Time.now @stage = :booted diff --git a/test/helpers/integration.rb b/test/helpers/integration.rb index ee3425c121..3cdfccf688 100644 --- a/test/helpers/integration.rb +++ b/test/helpers/integration.rb @@ -46,6 +46,10 @@ def teardown private + def silent_and_checked_system_command(*args) + assert(system(*args, out: File::NULL, err: File::NULL)) + end + def cli_server(argv, unix: false, config: nil) if config config_file = Tempfile.new(%w(config .rb)) @@ -160,7 +164,7 @@ def read_body(connection, time_out = 10) # gets worker pids from @server output def get_worker_pids(phase = 0, size = workers) pids = [] - re = /PID: (\d+)\) booted, phase: #{phase}/ + re = /PID: (\d+)\) booted in [.0-9]+s, phase: #{phase}/ while pids.size < size if pid = @server.gets[re, 1] pids << pid diff --git a/test/test_integration_cluster.rb b/test/test_integration_cluster.rb index fc22588f69..4cb0867b91 100644 --- a/test/test_integration_cluster.rb +++ b/test/test_integration_cluster.rb @@ -148,13 +148,13 @@ def test_stuck_phased_restart def test_worker_boot_timeout timeout = 1 - worker_timeout(timeout, 2, "worker_boot_timeout #{timeout}; on_worker_boot { sleep #{timeout + 1} }") + worker_timeout(timeout, 2, "worker failed to boot within \\\d+ seconds", "worker_boot_timeout #{timeout}; on_worker_boot { sleep #{timeout + 1} }") end def test_worker_timeout skip 'Thread#name not available' unless Thread.current.respond_to?(:name) timeout = Puma::Const::WORKER_CHECK_INTERVAL + 1 - worker_timeout(timeout, 1, <