diff --git a/octane/tests/test_maintenance.py b/octane/tests/test_maintenance.py
index a7f88443..de895913 100644
--- a/octane/tests/test_maintenance.py
+++ b/octane/tests/test_maintenance.py
@@ -11,6 +11,8 @@
 # under the License.
 
 import mock
+import pytest
+from xml.etree import ElementTree
 
 from octane.util import maintenance
 from octane.util import subprocess
@@ -21,6 +23,32 @@ def test_get_crm_services():
     assert sorted(res) == CRM_XML_PARSE_RESULT
 
 
+@pytest.mark.parametrize("resource_list,status,expected_result", [
+    (["master_p_rabbitmq-server", "vip__management_old"], False, False),
+    (["master_p_rabbitmq-server", "vip__management_old"], True, False),
+    (["master_p_rabbitmq-server", "p_ceilometer-alarm-evaluator"], False,
+     True),
+    (["clone_p_neutron-metadata-agent", "vip__management_old",
+      "group__zabbix-server"], True, True),
+    (["test1", "vip__management_old"], True, False),
+    (["test1", "test2"], False, True),
+])
+def test_resources_synced(resource_list, status, expected_result):
+    res = maintenance.is_resources_synced(resource_list, CRM_XML_STATUS_SAMPLE,
+                                          status)
+    assert res is expected_result
+
+
+def test_resources_status():
+    data = ElementTree.fromstring(CRM_XML_STATUS_SAMPLE)
+    resources = next(el for el in data if el.tag == 'resources')
+
+    result = []
+    for resource in resources:
+        result.append(maintenance.is_resource_active(resource))
+    assert result == [True, False, False, True, True]
+
+
 def test_stop_corosync_services(mocker, mock_ssh_call, mock_ssh_call_output,
                                 mock_subprocess, node):
     get_one_controller = mocker.patch('octane.util.env.get_one_controller')
@@ -34,6 +62,9 @@ def test_stop_corosync_services(mocker, mock_ssh_call, mock_ssh_call_output,
 
     mocker.patch('time.sleep')
 
+    wait_for_services = \
+        mocker.patch.object(maintenance, 'wait_for_corosync_services_sync')
+
     maintenance.stop_corosync_services('env')
 
     assert not mock_subprocess.called
@@ -41,6 +72,8 @@ def test_stop_corosync_services(mocker, mock_ssh_call, mock_ssh_call_output,
     mock_ssh_call_output.assert_called_once_with(['cibadmin', '--query',
                                                   '--scope', 'resources'],
                                                  node=node)
+    assert wait_for_services.call_args_list == \
+        [mock.call('env', ['s1', 's2'], 'stop')]
     assert mock_ssh_call.call_args_list == [
         mock.call(['crm', 'resource', 'stop', 's1'], node=node),
         mock.call(['crm', 'resource', 'stop', 's1'], node=node),
@@ -57,10 +90,16 @@ def test_start_corosync_services(mocker, mock_ssh_call, mock_ssh_call_output,
     mock_ssh_call.side_effect = \
         [None, subprocess.CalledProcessError(1, 'cmd'), None]
 
+    wait_for_services = \
+        mocker.patch.object(maintenance, 'wait_for_corosync_services_sync')
+
     maintenance.start_corosync_services('env')
 
     mock_ssh_call_output.assert_called_once_with(
        ['cibadmin', '--query', '--scope', 'resources'], node=node)
+
+    assert wait_for_services.call_args_list == \
+        [mock.call('env', ['test_service1', 'test_service2'], 'start')]
     assert mock_ssh_call.call_args_list == [
         mock.call(['crm', 'resource', 'start', 'test_service1'], node=node),
         mock.call(['crm', 'resource', 'start', 'test_service2'], node=node),
@@ -458,12 +497,61 @@ CRM_XML_SAMPLE = """
 """[1:]  # noqa
 
 CRM_XML_PARSE_RESULT = [
+    'clone_p_dns',
+    'clone_p_haproxy',
     'clone_p_heat-engine',
+    'clone_p_mysql',
     'clone_p_neutron-dhcp-agent',
     'clone_p_neutron-l3-agent',
     'clone_p_neutron-metadata-agent',
     'clone_p_neutron-plugin-openvswitch-agent',
+    'clone_p_ntp',
+    'clone_p_vrouter',
     'group__zabbix-server',
+    'master_p_conntrackd',
+    'master_p_rabbitmq-server',
     'p_ceilometer-agent-central',
     'p_ceilometer-alarm-evaluator',
+    'vip__management',
+    'vip__public',
+    'vip__vrouter',
+    'vip__vrouter_pub'
 ]
+
+CRM_XML_STATUS_SAMPLE = """
+...
+"""[1:]  # noqa
diff --git a/octane/util/maintenance.py b/octane/util/maintenance.py
index f84903c1..07a7d6f5 100644
--- a/octane/util/maintenance.py
+++ b/octane/util/maintenance.py
@@ -79,29 +79,93 @@ _default_exclude_services = ('p_mysql', 'p_haproxy', 'p_dns', 'p_ntp', 'vip',
                              'clone_p_vrouter')
 
 
-def get_crm_services(status_out, exclude=_default_exclude_services):
+def get_crm_services(status_out):
     data = ElementTree.fromstring(status_out)
     for resource in data:
-        name = resource.get('id')
-        if any(service in name for service in exclude):
-            continue
-        yield name
+        yield resource.get('id')
+
+
+def start_corosync_services(env):
+    manage_corosync_services(env, 'start')
 
 
 def stop_corosync_services(env):
+    manage_corosync_services(env, 'stop')
+
+
+def manage_corosync_services(env, status):
     node = env_util.get_one_controller(env)
     status_out = ssh.call_output(['cibadmin', '--query', '--scope',
                                   'resources'], node=node)
-    for service in get_crm_services(status_out):
+    services_list = []
+    for res in get_crm_services(status_out):
+        if any(service in res for service in _default_exclude_services):
+            continue
+        services_list.append(res)
+
+    for service in services_list:
         while True:
             try:
-                ssh.call(['crm', 'resource', 'stop', service],
+                ssh.call(['crm', 'resource', status, service],
                          node=node)
             except subprocess.CalledProcessError:
-                pass
+                # Sometimes pacemaker rejects part of the requests which
+                # it cannot process. Sleep was added to mitigate this risk.
+                time.sleep(1)
             else:
                 break
-        time.sleep(60)
+    wait_for_corosync_services_sync(env, services_list, status)
+
+
+def wait_for_corosync_services_sync(env, resource_list, status,
+                                    timeout=1200, check_freq=20):
+    status_bool = status == 'start'
+    node = env_util.get_one_controller(env)
+    started_at = time.time()
+    while True:
+        crm_out = ssh.call_output(['crm_mon', '--as-xml'], node=node)
+        if is_resources_synced(resource_list, crm_out, status_bool):
+            return
+        if time.time() - started_at >= timeout:
+            raise Exception("Timeout waiting for corosync cluster for env %s"
+                            " to be synced" % env.id)
+        time.sleep(check_freq)
+
+
+def is_resources_synced(resources, crm_out, status):
+    def get_resource(resources, resource_id):
+        for resource in resources:
+            if resource.get('id') == resource_id:
+                return resource
+        return None
+
+    data = ElementTree.fromstring(crm_out)
+    mon_resources = data.find('resources')
+    for resource in resources:
+        res = get_resource(mon_resources, resource)
+        if not (is_resource_active(res) is status):
+            return False
+    return True
+
+
+# Resources are fetched from the output of the 'crm_mon' command. This
+# command does not return a resource if it is not started, so we can
+# consider an 'absent' resource as disabled.
+def is_resource_active(resource):
+    if resource is None:
+        return False
+    if resource.tag == 'resource':
+        return is_primitive_active(resource)
+    for primitive in resource:
+        if not is_primitive_active(primitive):
+            return False
+    return True
+
+
+def is_primitive_active(resource):
+    if resource.get('active') == 'true':
+        return True
+    return False
 
 
 def stop_upstart_services(env):
@@ -129,22 +193,6 @@ def stop_upstart_services(env):
             ssh.call(['stop', service], node=node)
 
 
-def start_corosync_services(env):
-    node = next(env_util.get_controllers(env))
-    status_out = ssh.call_output(['cibadmin', '--query', '--scope',
-                                  'resources'], node=node)
-    for service in get_crm_services(status_out):
-        while True:
-            try:
-                ssh.call(['crm', 'resource', 'start', service],
-                         node=node)
-            except subprocess.CalledProcessError:
-                pass
-            else:
-                break
-        time.sleep(60)
-
-
 def start_upstart_services(env):
     controllers = list(env_util.get_controllers(env))
     for node in controllers:
@@ -179,3 +227,15 @@ def start_cluster(env):
     for node in controllers:
         for cmd in cmds:
             ssh.call(cmd, node=node)
+    # When we start the cluster we should wait until resources from the
+    # `_default_exclude_services` constant are up and running. Note that
+    # these resources are not touched by the stop/start corosync code.
+    node = env_util.get_one_controller(env)
+    status_out = ssh.call_output(['cibadmin', '--query', '--scope',
+                                  'resources'], node=node)
+    services_list = []
+    for res in get_crm_services(status_out):
+        if any(service in res for service in _default_exclude_services):
+            services_list.append(res)
+
+    wait_for_corosync_services_sync(env, services_list, 'start')
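The body of CRM_XML_STATUS_SAMPLE is not reproduced above, so the snippet below is only a minimal, hypothetical stand-in for the kind of `crm_mon --as-xml` document that is_resources_synced() and is_resource_active() consume: a <resources> element whose children are either primitives (<resource> tags carrying an `active` attribute) or aggregates such as <clone>/<master>/<group> that wrap primitives. The ids, tags, and attribute subsets here are illustrative assumptions, not the actual fixture used by the tests.

# A tiny, made-up crm_mon status document; real output carries many more
# attributes per resource than shown here.
from octane.util import maintenance

FAKE_CRM_MON_XML = """
<crm_mon>
  <resources>
    <resource id="vip__management_old" role="Started" active="true"/>
    <master id="master_p_rabbitmq-server">
      <resource id="p_rabbitmq-server" role="Stopped" active="false"/>
    </master>
    <clone id="clone_p_neutron-metadata-agent">
      <resource id="p_neutron-metadata-agent" role="Started" active="true"/>
    </clone>
  </resources>
</crm_mon>
"""

# vip__management_old is active, so a 'start' wait on it alone is satisfied.
assert maintenance.is_resources_synced(
    ["vip__management_old"], FAKE_CRM_MON_XML, True)
# master_p_rabbitmq-server wraps an inactive primitive, so it does not count
# as started yet; a resource missing from the output counts as inactive too.
assert not maintenance.is_resources_synced(
    ["master_p_rabbitmq-server"], FAKE_CRM_MON_XML, True)
assert not maintenance.is_resources_synced(
    ["some_absent_resource"], FAKE_CRM_MON_XML, True)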
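get_crm_services() no longer takes an exclude argument; manage_corosync_services() and start_cluster() now filter against _default_exclude_services themselves with a substring check. A rough, self-contained sketch of that split follows; the CIB fragment is a made-up stand-in for `cibadmin --query --scope resources` output, not real cibadmin data.

# Hypothetical CIB resources fragment, heavily trimmed for illustration.
from octane.util import maintenance

CIB_RESOURCES_XML = """
<resources>
  <clone id="clone_p_vrouter"/>
  <primitive id="vip__management"/>
  <clone id="clone_p_heat-engine"/>
  <master id="master_p_rabbitmq-server"/>
</resources>
"""

all_ids = list(maintenance.get_crm_services(CIB_RESOURCES_XML))

# Same substring match as in manage_corosync_services(): 'clone_p_vrouter'
# and 'vip__management' hit the exclude list ('clone_p_vrouter' and 'vip'),
# so only the remaining two services get stopped/started and waited on.
managed = [res for res in all_ids
           if not any(service in res
                      for service in maintenance._default_exclude_services)]
assert managed == ['clone_p_heat-engine', 'master_p_rabbitmq-server']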