From 92a7c7275a78186749e80076222a2d46517bfa2c Mon Sep 17 00:00:00 2001 From: Eduardo Olivares Date: Fri, 10 Mar 2023 08:41:20 +0100 Subject: [PATCH] Improve faults tests' neutron agent health checks One of the verifications performed as part of the cloud health checks, which are executed before and after the disruptions in the faults tests, was to check that all the neutron agents are alive. This patch adds an extra verification: the agents have to remain consistently alive over time. Besides that, the number of reruns for the faults test test_controllers_shutdown is set to 0 because reruns make analysing its failures more complicated. The test keeps its flaky marker because we still need to determine whether there are more issues with it. Change-Id: I354c66453493339622f99c0d18e1ff98f9f609e0 --- tobiko/openstack/tests/__init__.py | 2 ++ tobiko/openstack/tests/_neutron.py | 35 +++++++++++++++++++ tobiko/tests/faults/ha/test_cloud_recovery.py | 4 +-- tobiko/tests/sanity/neutron/test_agents.py | 16 +-------- 4 files changed, 40 insertions(+), 17 deletions(-) diff --git a/tobiko/openstack/tests/__init__.py b/tobiko/openstack/tests/__init__.py index bf21568d7..6e869d063 100644 --- a/tobiko/openstack/tests/__init__.py +++ b/tobiko/openstack/tests/__init__.py @@ -22,6 +22,8 @@ from tobiko.openstack.tests import _nova InvalidDBConnString = _neutron.InvalidDBConnString RAFTStatusError = _neutron.RAFTStatusError test_neutron_agents_are_alive = _neutron.test_neutron_agents_are_alive +test_alive_agents_are_consistent_along_time = ( + _neutron.test_alive_agents_are_consistent_along_time) test_ovn_dbs_validations = _neutron.test_ovn_dbs_validations test_ovs_bridges_mac_table_size = _neutron.test_ovs_bridges_mac_table_size test_ovs_namespaces_are_absent = _neutron.test_ovs_namespaces_are_absent diff --git a/tobiko/openstack/tests/_neutron.py b/tobiko/openstack/tests/_neutron.py index d654292eb..62c9ca1e5 100644 --- a/tobiko/openstack/tests/_neutron.py +++ b/tobiko/openstack/tests/_neutron.py @@ -94,6 +94,41
@@ def test_neutron_agents_are_alive(timeout=420., interval=5.) \ return agents +def test_alive_agents_are_consistent_along_time(previous_alive_agents=None): + test_case = tobiko.get_test_case() + if previous_alive_agents is None: + # the following dict of agents is obtained when: + # - the list_agents request is replied with 200 + # - the list is not empty + # - no agents are dead + alive_agents = {agent['id']: agent + for agent in test_neutron_agents_are_alive()} + else: + alive_agents = previous_alive_agents + + for attempt in tobiko.retry(sleep_time=5., count=5): + agents = neutron.list_agents() + actual = {agent['id']: agent + for agent in agents} + + # any dead agents? If yes, fail now + dead_agents = agents.with_items(alive=False) + test_case.assertEqual( + [], dead_agents, "Some neutron agents died") + + if len(actual) > len(alive_agents): + LOG.debug('Some new agents appeared! It seems not all the agents ' + 'had been started yet, so let\'s restart this check') + return test_alive_agents_are_consistent_along_time(actual) + + # any agent disappeared? 
If yes, fail now + test_case.assertEqual( + set(alive_agents), set(actual), 'Some agents disappeared') + + if attempt.is_last: + break + + def ovn_dbs_vip_bindings(test_case): ovn_conn_str = get_ovn_db_connections() # ovn db sockets might be centrillized or distributed diff --git a/tobiko/tests/faults/ha/test_cloud_recovery.py b/tobiko/tests/faults/ha/test_cloud_recovery.py index f668da311..25e0b389d 100644 --- a/tobiko/tests/faults/ha/test_cloud_recovery.py +++ b/tobiko/tests/faults/ha/test_cloud_recovery.py @@ -45,7 +45,7 @@ def overcloud_health_checks(passive_checks_only=False, check_pacemaker_resources_health() check_overcloud_processes_health() nova.check_nova_services_health() - tests.test_neutron_agents_are_alive() + tests.test_alive_agents_are_consistent_along_time() if not passive_checks_only: # create a uniq stack check_vm_create() @@ -265,7 +265,7 @@ class DisruptTripleoNodesTest(testtools.TestCase): cloud_disruptions.request_galera_sst() OvercloudHealthCheck.run_after() - @pytest.mark.flaky(reruns=3, reruns_delay=60) + @pytest.mark.flaky(reruns=0) def test_controllers_shutdown(self): OvercloudHealthCheck.run_before() cloud_disruptions.test_controllers_shutdown() diff --git a/tobiko/tests/sanity/neutron/test_agents.py b/tobiko/tests/sanity/neutron/test_agents.py index 91c4a5d61..d6126d283 100644 --- a/tobiko/tests/sanity/neutron/test_agents.py +++ b/tobiko/tests/sanity/neutron/test_agents.py @@ -17,8 +17,6 @@ from __future__ import absolute_import import pytest import testtools -import tobiko -from tobiko.openstack import neutron from tobiko.openstack import tests @@ -29,16 +27,4 @@ class NeutronAgentTest(testtools.TestCase): tests.test_neutron_agents_are_alive() def test_alive_agents_are_consistent_along_time(self): - alive_agents = {agent['id']: agent - for agent in tests.test_neutron_agents_are_alive()} - for attempt in tobiko.retry(sleep_time=5., count=5): - agents = neutron.list_agents() - actual = {agent['id']: agent - for agent in agents} - 
self.assertEqual(set(alive_agents), set(actual), - 'Agents appeared or disappeared') - dead_agents = agents.with_items(alive=False) - self.assertEqual([], dead_agents, - "Neutron agent(s) no more alive") - if attempt.is_last: - break + tests.test_alive_agents_are_consistent_along_time()