Skip to content

Commit

Permalink
[#162275 Make the SciShield backup layer more resilient (#4240)
Browse files Browse the repository at this point in the history
# Release Notes

We're seeing frequent and inconsistent Net::OpenTimeout errors that are causing `api_unavailable?` to fail, preventing the sync task from running.
By making `api_unavailable?` more resilient, we have a better chance of keeping the backup layer more up to date.
We are in communication with SciShield customer service as well to see if they can help identify the root cause.
  • Loading branch information
giladshanan committed May 9, 2024
1 parent 654caff commit 4338bdd
Show file tree
Hide file tree
Showing 2 changed files with 113 additions and 66 deletions.
Expand Up @@ -12,47 +12,7 @@ def synchronize
Rails.logger.error(msg)
Rollbar.error(msg) if defined?(Rollbar)
else
user_certs = {}

# In some cases, making many requests in a row results in the API not
# responding. Making requests in batches with sleep in between seems
# to make the API work in most cases where this is a problem. `sleep`
# is put first here because `api_unavailable?` will have already made
# 10 API requests.
users.in_batches(of: batch_size) do |user_batch|
sleep(batch_sleep_time)

user_batch.each do |user|
adapter = ScishieldApiAdapter.new(user, api_client)
retries = 0
retry_sleep_time = batch_sleep_time

begin
cert_names = adapter.certified_course_names_from_api
user_certs[user.id.to_s] = cert_names if cert_names.presence
rescue => e
# Sometimes the sleep between batches is not enough and failures
# happen. Sleeping before retrying an individual request seems to
# allow that request to succeed. Each retry sleeps 25% longer than
# the previous try.
if retries < retry_max
retry_sleep_time *= 1.25
msg = "ScishieldTrainingSynchronizer#synchronize request for user id #{user.id} failed, retrying in #{retry_sleep_time} seconds. Error: #{e.message}"

Rails.logger.warn(msg)
Rollbar.warn(msg) if defined?(Rollbar)

retries += 1
sleep(retry_sleep_time)
retry
else
Rails.logger.error(e.message)
Rollbar.error(e.message) if defined?(Rollbar)
end
end
end
end

user_certs = fetch_user_certs
ScishieldTraining.transaction do
ScishieldTraining.delete_all

Expand All @@ -78,11 +38,79 @@ def synchronize
end

def api_unavailable?
# Test API responses for 10 random users
users.sample(batch_size).map do |user|
sleep(batch_sleep_time / 2)
api_client.invalid_response?(user.email)
end.all?
retries = 0
begin
# Test API responses for `batch_size` random users
users.sample(batch_size).map do |user|
puts "Testing user: #{user.email}"
response_is_invalid = api_client.invalid_response?(user.email)
puts "Invalid response: #{response_is_invalid}"
response_is_invalid
end.all?
rescue Net::OpenTimeout => e
retries += 1
if retries < retry_max
puts "Timeout error: #{e.message}"
puts "retrying in #{batch_sleep_time / 2} seconds"
sleep(batch_sleep_time / 2) # just doing one batch, so no need to sleep the full time
retry
else
puts "Timeout error: #{e.message}"
puts "Max retries reached, aborting"
true
end
end
end

def fetch_user_certs
certs = {}

# In some cases, making many requests in a row results in the API not
# responding. Making requests in batches with sleep in between seems
# to make the API work in most cases where this is a problem. `sleep`
# is put first here because `api_unavailable?` will have already made
# 10 API requests.
puts users.size
puts batch_size
total_batches = users.size / batch_size
puts "#{total_batches} batches to be processed"
batch_number = 0
users.in_batches(of: batch_size) do |user_batch|
batch_number += 1
puts "starting batch #{batch_number} of #{total_batches}"
sleep(batch_sleep_time)

user_batch.each do |user|
adapter = ScishieldApiAdapter.new(user)
retries = 0
retry_sleep_time = batch_sleep_time

begin
cert_names = adapter.certified_course_names_from_api
certs[user.id.to_s] = cert_names if cert_names.presence
rescue => e
# Sometimes the sleep between batches is not enough and failures
# happen. Sleeping before retrying an individual request seems to
# allow that request to succeed. Each retry sleeps 25% longer than
# the previous try.
if retries < retry_max
retry_sleep_time *= 1.25
msg = "ScishieldTrainingSynchronizer#synchronize request for user id #{user.id} failed, retrying in #{retry_sleep_time} seconds. Error: #{e.message}"

Rails.logger.warn(msg)
Rollbar.warn(msg) if defined?(Rollbar)

retries += 1
sleep(retry_sleep_time)
retry
else
Rails.logger.error(e.message)
Rollbar.error(e.message) if defined?(Rollbar)
end
end
end
end
certs
end

def users
Expand Down
59 changes: 39 additions & 20 deletions spec/system/research_safety/scishield/scishield_timeout_spec.rb
Expand Up @@ -18,35 +18,54 @@
)
end

before do
# This will cause ResearchSafetyAdapters::ScishieldApiClient#certifications_for
# to raise ResearchSafetyAdapters::ScishieldApiError
allow_any_instance_of(ResearchSafetyAdapters::ScishieldApiClient).to(
receive(:training_api_request).and_raise(Net::OpenTimeout)
)
end

describe "viewing a user's certificates" do
describe "api_unavailable?" do
before do
login_as admin
visit facility_user_user_research_safety_certifications_path Facility.cross_facility, user
# Raise a Net::OpenTimeout error when calling ScishieldApiClient#invalid_response?
allow_any_instance_of(ResearchSafetyAdapters::ScishieldApiClient).to(
receive(:invalid_response?).and_raise(Net::OpenTimeout)
)
# We just want to test the retry logic, so we'll set the retry_max to 2
allow_any_instance_of(ResearchSafetyAdapters::ScishieldTrainingSynchronizer).to(
receive(:retry_max).and_return(2)
)
end

it "displays an error on the page" do
expect(page).to have_content I18n.t("services.research_safety_adapters.scishield_api_client.request_failed")
it "returns true instead of failing with a Net::OpenTimeout error" do
expect(ResearchSafetyAdapters::ScishieldTrainingSynchronizer.new.api_unavailable?).to be true
end
end

describe "making a reservation" do
describe "Net::OpenTimeout errors when the user does not have a ScishieldTraining record" do
before do
login_as user
visit facility_path(facility)
click_link instrument.name
# This will cause ResearchSafetyAdapters::ScishieldApiClient#certifications_for
# to raise ResearchSafetyAdapters::ScishieldApiError
allow_any_instance_of(ResearchSafetyAdapters::ScishieldApiClient).to(
receive(:training_api_request).and_raise(Net::OpenTimeout)
)
end

it "displays an error on the page" do
click_button "Create"
expect(page).to have_content I18n.t("services.research_safety_adapters.scishield_api_client.request_failed")
describe "viewing a user's certificates" do
before do
login_as admin
visit facility_user_user_research_safety_certifications_path Facility.cross_facility, user
end

it "displays an error on the page" do
expect(page).to have_content I18n.t("services.research_safety_adapters.scishield_api_client.request_failed")
end
end

describe "making a reservation" do
before do
login_as user
visit facility_path(facility)
click_link instrument.name
end

it "displays an error on the page" do
click_button "Create"
expect(page).to have_content I18n.t("services.research_safety_adapters.scishield_api_client.request_failed")
end
end
end
end

0 comments on commit 4338bdd

Please sign in to comment.