From 7faabd0f57722c75bd46e186d7ffc5b53a12da53 Mon Sep 17 00:00:00 2001 From: Franz Liedke Date: Mon, 6 Sep 2021 21:52:49 +0200 Subject: [PATCH] Reconnect on "UNBLOCKED force unblock" errors These errors can occur during Sidekiq's long-running job fetching command. This uses Redis' blocking BRPOP primitive. On failover in a cluster setup, these commands are interrupted by the server. This error causes the worker threads to be restarted, but as they are bubbled up to the top, they cause a lot of spam in our error logging systems. As related errors from other commands are being handled (see #2550 and #4495) this way, it seems sensible to also handle this one. --- lib/sidekiq.rb | 3 ++- test/test_sidekiq.rb | 10 ++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/lib/sidekiq.rb b/lib/sidekiq.rb index 3ee4f9d26a..6f5d3c80f7 100644 --- a/lib/sidekiq.rb +++ b/lib/sidekiq.rb @@ -100,7 +100,8 @@ def self.redis # 2550 Failover can cause the server to become a replica, need # to disconnect and reopen the socket to get back to the primary. # 4495 Use the same logic if we have a "Not enough replicas" error from the primary - if retryable && ex.message =~ /READONLY|NOREPLICAS/ + # 4985 Use the same logic when a blocking command is force-unblocked + if retryable && ex.message =~ /READONLY|NOREPLICAS|master -> replica/ conn.disconnect! 
retryable = false retry diff --git a/test/test_sidekiq.rb b/test/test_sidekiq.rb index 7a1c625651..1852a6261b 100644 --- a/test/test_sidekiq.rb +++ b/test/test_sidekiq.rb @@ -96,6 +96,16 @@ assert_equal 2, counts.size assert_equal counts[0] + 1, counts[1] end + + it 'reconnects if instance state changed' do + counts = [] + Sidekiq.redis do |c| + counts << c.info['total_connections_received'].to_i + raise Redis::CommandError, "UNBLOCKED force unblock from blocking operation, instance state changed (master -> replica?)" if counts.size == 1 + end + assert_equal 2, counts.size + assert_equal counts[0] + 1, counts[1] + end end describe 'redis info' do