From 8bb84b038ca4771701c33b597234e8f8b9d74b52 Mon Sep 17 00:00:00 2001
From: Alvaro Carvajal
Date: Mon, 3 May 2021 17:11:53 +0200
Subject: [PATCH 1/2] Add a CPU usage check for HAWK/PUMA

This adds a CPU usage check on the ha/check_hawk test module while a
client running the ha/hawk_gui test module is interacting with HAWK. It
will soft fail with bsc#1179609 (HAWK/PUMA consume a considerable amount
of CPU) if HAWK/PUMA CPU usage is over 50%.
---
 tests/ha/barrier_init.pm |  8 +++++---
 tests/ha/check_hawk.pm   | 38 ++++++++++++++++++++++++++++++++++++++
 tests/ha/hawk_gui.pm     |  2 ++
 3 files changed, 45 insertions(+), 3 deletions(-)

diff --git a/tests/ha/barrier_init.pm b/tests/ha/barrier_init.pm
index 29634828945c..17bea28ca674 100644
--- a/tests/ha/barrier_init.pm
+++ b/tests/ha/barrier_init.pm
@@ -108,9 +108,11 @@ sub run {
     barrier_create("PACEMAKER_CTS_CHECKED_$cluster_name", $num_nodes + 1);
 
     # HAWK_GUI_ barriers also have to wait in the client
-    barrier_create("HAWK_GUI_INIT_$cluster_name",    $num_nodes + 1);
-    barrier_create("HAWK_GUI_CHECKED_$cluster_name", $num_nodes + 1);
-    barrier_create("HAWK_FENCE_$cluster_name",       $num_nodes + 1);
+    barrier_create("HAWK_GUI_INIT_$cluster_name",            $num_nodes + 1);
+    barrier_create("HAWK_GUI_CHECKED_$cluster_name",         $num_nodes + 1);
+    barrier_create("HAWK_GUI_CPU_TEST_START_$cluster_name",  $num_nodes + 1);
+    barrier_create("HAWK_GUI_CPU_TEST_FINISH_$cluster_name", $num_nodes + 1);
+    barrier_create("HAWK_FENCE_$cluster_name",               $num_nodes + 1);
 
     # CTDB barriers
     barrier_create("CTDB_INIT_$cluster_name", $num_nodes + 1);
diff --git a/tests/ha/check_hawk.pm b/tests/ha/check_hawk.pm
index 636fe2c4e594..d6c49f602dda 100644
--- a/tests/ha/check_hawk.pm
+++ b/tests/ha/check_hawk.pm
@@ -19,6 +19,43 @@ use lockapi;
 use hacluster qw(get_cluster_name is_node);
 use utils 'systemctl';
 use version_utils 'is_sle';
+use List::Util qw(sum);
+
+sub check_hawk_cpu {
+    my $cluster_name = get_cluster_name;
+    my @cpu_usage    = ();
+
+    barrier_wait("HAWK_GUI_CPU_TEST_START_$cluster_name");
+    while (!barrier_try_wait("HAWK_GUI_CPU_TEST_FINISH_$cluster_name")) {
+        # Wrapping script_output in eval { } as node can be fenced by hawk test from client.
+        # In fenced node, script_output will croak and kill the test. This prevents it
+        my $metric = eval {
+            script_output q@ps axo pcpu,comm | awk '/hawk|puma/ {total += $1} END {print "cpu_usage["total"]"}'@,
+              proceed_on_failure => 1;
+        };
+        if ($@) {
+            # When script_output croaks, command may be typed when SUT is on the grub menu
+            # and either boot the system or get into grub editing. If system has booted,
+            # force a new fence; if it's still in grub menu, do nothing; otherwise send an
+            # ESC to return SUT to grub menu and exit the loop
+            if (check_screen('linux-login')) {
+                reset_consoles;
+                select_console('root-console');
+                enter_cmd 'echo b > /proc/sysrq-trigger';
+            }
+            else {
+                send_key 'esc' unless check_screen('grub2');
+            }
+            barrier_wait("HAWK_GUI_CPU_TEST_FINISH_$cluster_name");
+            last;
+        }
+        push @cpu_usage, $metric =~ /cpu_usage\[([\d\.]+)\]/;
+        sleep bmwqemu::scale_timeout(1);
+    }
+    my $cpu_usage = sum(@cpu_usage) / @cpu_usage;
+    record_info "CPU usage", "HAWK/PUMA CPU usage was $cpu_usage";
+    record_soft_failure "bsc#1179609 - HAWK/PUMA consume a considerable amount of CPU" if ($cpu_usage >= 50);
+}
 
 sub run {
     my $cluster_name = get_cluster_name;
@@ -55,6 +92,7 @@ sub run {
     # If testing HAWK GUI, also wait for those barriers
     if (get_var('HAWKGUI_TEST_ROLE')) {
         barrier_wait("HAWK_GUI_INIT_$cluster_name");
+        check_hawk_cpu;
         barrier_wait("HAWK_GUI_CHECKED_$cluster_name");
     }
 
diff --git a/tests/ha/hawk_gui.pm b/tests/ha/hawk_gui.pm
index f028c9879920..0ff75adf4b64 100644
--- a/tests/ha/hawk_gui.pm
+++ b/tests/ha/hawk_gui.pm
@@ -86,6 +86,7 @@ sub run {
     add_to_known_hosts($node2);
     assert_script_run "mkdir -m 1777 $path";
     assert_script_run "xhost +";
+    barrier_wait("HAWK_GUI_CPU_TEST_START_$cluster_name");
     my $docker_cmd = "docker run --rm --name test --ipc=host -v /tmp/.X11-unix:/tmp/.X11-unix -e DISPLAY=\$DISPLAY -v \$PWD/$path:/$path ";
     $docker_cmd .= "$docker_image -b $browser -H $node1 -S $node2 -s $testapi::password -r /$results --virtual-ip $virtual_ip";
     enter_cmd "$docker_cmd | tee $logs; echo $pyscr-\$PIPESTATUS > $retcode";
@@ -114,6 +115,7 @@ sub run {
     save_screenshot;
 
     assert_screen "generic-desktop";
+    barrier_wait("HAWK_GUI_CPU_TEST_FINISH_$cluster_name");
 
     # Error, log and results handling
     select_console 'user-console';

From 00350f818501f66e39960bb52e6d2ee050f66d4f Mon Sep 17 00:00:00 2001
From: Alvaro Carvajal
Date: Wed, 5 May 2021 10:32:24 +0200
Subject: [PATCH 2/2] Add CPU usage check when HAWK is idle

Also skip the average (and the soft-failure check) when no CPU usage
sample could be collected, e.g. because no hawk/puma process matched or
the very first script_output croaked. Averaging an empty sample list
would otherwise kill the test with a division by zero.
---
 tests/ha/check_hawk.pm | 38 +++++++++++++++++++++++++++-----------
 1 file changed, 27 insertions(+), 11 deletions(-)

diff --git a/tests/ha/check_hawk.pm b/tests/ha/check_hawk.pm
index d6c49f602dda..2aa4fc57def1 100644
--- a/tests/ha/check_hawk.pm
+++ b/tests/ha/check_hawk.pm
@@ -1,6 +1,6 @@
 # SUSE's openQA tests
 #
-# Copyright (c) 2018-2019 SUSE LLC
+# Copyright (c) 2018-2021 SUSE LLC
 #
 # Copying and distribution of this file, with or without modification,
 # are permitted in any medium without royalty provided the copyright
@@ -17,21 +17,26 @@ use warnings;
 use testapi;
 use lockapi;
 use hacluster qw(get_cluster_name is_node);
-use utils 'systemctl';
-use version_utils 'is_sle';
+use utils qw(systemctl);
+use version_utils qw(is_sle);
 use List::Util qw(sum);
 
 sub check_hawk_cpu {
-    my $cluster_name = get_cluster_name;
-    my @cpu_usage    = ();
+    my %args             = @_;
+    my $cluster_name     = get_cluster_name;
+    my @cpu_usage        = ();
+    my $threshold        = $args{idle_check} ? 10 : 50;
+    my $idle_check_loops = 60;
+
+    # Do not wait on barriers if checking CPU usage while HAWK is idle
+    barrier_wait("HAWK_GUI_CPU_TEST_START_$cluster_name") unless $args{idle_check};
 
-    barrier_wait("HAWK_GUI_CPU_TEST_START_$cluster_name");
-    while (!barrier_try_wait("HAWK_GUI_CPU_TEST_FINISH_$cluster_name")) {
+    while ($args{idle_check} || !barrier_try_wait("HAWK_GUI_CPU_TEST_FINISH_$cluster_name")) {
         # Wrapping script_output in eval { } as node can be fenced by hawk test from client.
         # In fenced node, script_output will croak and kill the test. This prevents it
         my $metric = eval {
             script_output q@ps axo pcpu,comm | awk '/hawk|puma/ {total += $1} END {print "cpu_usage["total"]"}'@,
-              proceed_on_failure => 1;
+              proceed_on_failure => 1, quiet => 1;
         };
         if ($@) {
             # When script_output croaks, command may be typed when SUT is on the grub menu
@@ -46,15 +51,24 @@ sub check_hawk_cpu {
             else {
                 send_key 'esc' unless check_screen('grub2');
             }
-            barrier_wait("HAWK_GUI_CPU_TEST_FINISH_$cluster_name");
+            barrier_wait("HAWK_GUI_CPU_TEST_FINISH_$cluster_name") unless $args{idle_check};
             last;
         }
         push @cpu_usage, $metric =~ /cpu_usage\[([\d\.]+)\]/;
         sleep bmwqemu::scale_timeout(1);
+        last if ($args{idle_check} && (--$idle_check_loops < 0));
     }
+    # Bail out when no sample was collected (no hawk/puma process matched, or the very
+    # first script_output croaked): averaging an empty list would divide by zero
+    unless (@cpu_usage) {
+        record_info "CPU usage", "No HAWK/PUMA CPU usage samples were collected";
+        return;
+    }
     my $cpu_usage = sum(@cpu_usage) / @cpu_usage;
-    record_info "CPU usage", "HAWK/PUMA CPU usage was $cpu_usage";
-    record_soft_failure "bsc#1179609 - HAWK/PUMA consume a considerable amount of CPU" if ($cpu_usage >= 50);
+    my $msg = "HAWK/PUMA CPU usage was $cpu_usage";
+    $msg .= " while idle" if $args{idle_check};
+    record_info "CPU usage", $msg;
+    record_soft_failure "bsc#1179609 - HAWK/PUMA consume a considerable amount of CPU" if ($cpu_usage >= $threshold);
 }
 
 sub run {
@@ -87,6 +101,8 @@ sub run {
     # Keep a screenshot for this test
     save_screenshot;
 
+    check_hawk_cpu(idle_check => 1);
+
     barrier_wait("HAWK_CHECKED_$cluster_name");
 
     # If testing HAWK GUI, also wait for those barriers