Use a more correct approach to detect resource status
Closes-bug: 1499696 Change-Id: Ied72bc68ba41e47334796f09b407ebd8a3fb7f40
This commit is contained in:
parent
2cbab19251
commit
3d06b9d8dd
|
@ -11,6 +11,8 @@
|
||||||
# under the License.
|
# under the License.
|
||||||
|
|
||||||
import mock
|
import mock
|
||||||
|
import pytest
|
||||||
|
from xml.etree import ElementTree
|
||||||
|
|
||||||
from octane.util import maintenance
|
from octane.util import maintenance
|
||||||
from octane.util import subprocess
|
from octane.util import subprocess
|
||||||
|
@ -21,6 +23,32 @@ def test_get_crm_services():
|
||||||
assert sorted(res) == CRM_XML_PARSE_RESULT
|
assert sorted(res) == CRM_XML_PARSE_RESULT
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("resource_list,status,expected_result", [
    (["master_p_rabbitmq-server", "vip__management_old"], False, False),
    (["master_p_rabbitmq-server", "vip__management_old"], True, False),
    (["master_p_rabbitmq-server", "p_ceilometer-alarm-evaluator"], False,
     True),
    (["clone_p_neutron-metadata-agent", "vip__management_old",
      "group__zabbix-server"], True, True),
    (["test1", "vip__management_old"], True, False),
    (["test1", "test2"], False, True),
])
def test_resources_synced(resource_list, status, expected_result):
    """Check is_resources_synced against the sample crm_mon XML.

    `status` is the desired state (True == started); the function must
    report whether every listed resource already matches that state.
    """
    synced = maintenance.is_resources_synced(
        resource_list, CRM_XML_STATUS_SAMPLE, status)
    assert synced is expected_result
|
|
||||||
|
|
||||||
|
def test_resources_status():
    """is_resource_active must classify each sample resource correctly.

    The expected list mirrors the order of the <resources> children in
    CRM_XML_STATUS_SAMPLE: vip (active), stopped primitive, rabbitmq clone
    with a stopped copy, fully-active metadata-agent clone, zabbix group.
    """
    data = ElementTree.fromstring(CRM_XML_STATUS_SAMPLE)
    resources = next(el for el in data if el.tag == 'resources')

    result = [maintenance.is_resource_active(res) for res in resources]
    assert result == [True, False, False, True, True]
|
||||||
|
|
||||||
|
|
||||||
def test_stop_corosync_services(mocker, mock_ssh_call, mock_ssh_call_output,
|
def test_stop_corosync_services(mocker, mock_ssh_call, mock_ssh_call_output,
|
||||||
mock_subprocess, node):
|
mock_subprocess, node):
|
||||||
get_one_controller = mocker.patch('octane.util.env.get_one_controller')
|
get_one_controller = mocker.patch('octane.util.env.get_one_controller')
|
||||||
|
@ -34,6 +62,9 @@ def test_stop_corosync_services(mocker, mock_ssh_call, mock_ssh_call_output,
|
||||||
|
|
||||||
mocker.patch('time.sleep')
|
mocker.patch('time.sleep')
|
||||||
|
|
||||||
|
wait_for_services = \
|
||||||
|
mocker.patch.object(maintenance, 'wait_for_corosync_services_sync')
|
||||||
|
|
||||||
maintenance.stop_corosync_services('env')
|
maintenance.stop_corosync_services('env')
|
||||||
|
|
||||||
assert not mock_subprocess.called
|
assert not mock_subprocess.called
|
||||||
|
@ -41,6 +72,8 @@ def test_stop_corosync_services(mocker, mock_ssh_call, mock_ssh_call_output,
|
||||||
mock_ssh_call_output.assert_called_once_with(['cibadmin', '--query',
|
mock_ssh_call_output.assert_called_once_with(['cibadmin', '--query',
|
||||||
'--scope', 'resources'],
|
'--scope', 'resources'],
|
||||||
node=node)
|
node=node)
|
||||||
|
assert wait_for_services.call_args_list == \
|
||||||
|
[mock.call('env', ['s1', 's2'], 'stop')]
|
||||||
assert mock_ssh_call.call_args_list == [
|
assert mock_ssh_call.call_args_list == [
|
||||||
mock.call(['crm', 'resource', 'stop', 's1'], node=node),
|
mock.call(['crm', 'resource', 'stop', 's1'], node=node),
|
||||||
mock.call(['crm', 'resource', 'stop', 's1'], node=node),
|
mock.call(['crm', 'resource', 'stop', 's1'], node=node),
|
||||||
|
@ -57,10 +90,16 @@ def test_start_corosync_services(mocker, mock_ssh_call, mock_ssh_call_output,
|
||||||
mock_ssh_call.side_effect = \
|
mock_ssh_call.side_effect = \
|
||||||
[None, subprocess.CalledProcessError(1, 'cmd'), None]
|
[None, subprocess.CalledProcessError(1, 'cmd'), None]
|
||||||
|
|
||||||
|
wait_for_services = \
|
||||||
|
mocker.patch.object(maintenance, 'wait_for_corosync_services_sync')
|
||||||
|
|
||||||
maintenance.start_corosync_services('env')
|
maintenance.start_corosync_services('env')
|
||||||
|
|
||||||
mock_ssh_call_output.assert_called_once_with(
|
mock_ssh_call_output.assert_called_once_with(
|
||||||
['cibadmin', '--query', '--scope', 'resources'], node=node)
|
['cibadmin', '--query', '--scope', 'resources'], node=node)
|
||||||
|
|
||||||
|
assert wait_for_services.call_args_list == \
|
||||||
|
[mock.call('env', ['test_service1', 'test_service2'], 'start')]
|
||||||
assert mock_ssh_call.call_args_list == [
|
assert mock_ssh_call.call_args_list == [
|
||||||
mock.call(['crm', 'resource', 'start', 'test_service1'], node=node),
|
mock.call(['crm', 'resource', 'start', 'test_service1'], node=node),
|
||||||
mock.call(['crm', 'resource', 'start', 'test_service2'], node=node),
|
mock.call(['crm', 'resource', 'start', 'test_service2'], node=node),
|
||||||
|
@ -458,12 +497,61 @@ CRM_XML_SAMPLE = """
|
||||||
</resources>
|
</resources>
|
||||||
"""[1:] # noqa
|
"""[1:] # noqa
|
||||||
CRM_XML_PARSE_RESULT = [
|
CRM_XML_PARSE_RESULT = [
|
||||||
|
'clone_p_dns',
|
||||||
|
'clone_p_haproxy',
|
||||||
'clone_p_heat-engine',
|
'clone_p_heat-engine',
|
||||||
|
'clone_p_mysql',
|
||||||
'clone_p_neutron-dhcp-agent',
|
'clone_p_neutron-dhcp-agent',
|
||||||
'clone_p_neutron-l3-agent',
|
'clone_p_neutron-l3-agent',
|
||||||
'clone_p_neutron-metadata-agent',
|
'clone_p_neutron-metadata-agent',
|
||||||
'clone_p_neutron-plugin-openvswitch-agent',
|
'clone_p_neutron-plugin-openvswitch-agent',
|
||||||
|
'clone_p_ntp',
|
||||||
|
'clone_p_vrouter',
|
||||||
'group__zabbix-server',
|
'group__zabbix-server',
|
||||||
|
'master_p_conntrackd',
|
||||||
|
'master_p_rabbitmq-server',
|
||||||
'p_ceilometer-agent-central',
|
'p_ceilometer-agent-central',
|
||||||
'p_ceilometer-alarm-evaluator',
|
'p_ceilometer-alarm-evaluator',
|
||||||
|
'vip__management',
|
||||||
|
'vip__public',
|
||||||
|
'vip__vrouter',
|
||||||
|
'vip__vrouter_pub'
|
||||||
]
|
]
|
||||||
|
CRM_XML_STATUS_SAMPLE = """
|
||||||
|
<crm_mon version="1.1.12">
|
||||||
|
<resources>
|
||||||
|
<resource id="vip__management_old" resource_agent="ocf::mirantis:ns_IPaddr2" role="Started" active="true" orphaned="false" managed="true" failed="false" failure_ignored="false" nodes_running_on="1" >
|
||||||
|
<node name="node-2" id="node-2" cached="false"/>
|
||||||
|
</resource>
|
||||||
|
<resource id="p_ceilometer-alarm-evaluator" resource_agent="ocf::mirantis:ceilometer-alarm-evaluator" role="Started" active="false" orphaned="false" managed="true" failed="false" failure_ignored="false" nodes_running_on="0" />
|
||||||
|
<clone id="master_p_rabbitmq-server" multi_state="true" unique="false" managed="true" failed="false" failure_ignored="false" >
|
||||||
|
<resource id="p_rabbitmq-server" resource_agent="ocf::mirantis:rabbitmq-server" role="Master" active="true" orphaned="false" managed="true" failed="false" failure_ignored="false" nodes_running_on="1" >
|
||||||
|
<node name="node-3" id="node-3" cached="false"/>
|
||||||
|
</resource>
|
||||||
|
<resource id="p_rabbitmq-server" resource_agent="ocf::mirantis:rabbitmq-server" role="Slave" active="true" orphaned="false" managed="true" failed="false" failure_ignored="false" nodes_running_on="1" >
|
||||||
|
<node name="node-2" id="node-2" cached="false"/>
|
||||||
|
</resource>
|
||||||
|
<resource id="p_rabbitmq-server" resource_agent="ocf::mirantis:rabbitmq-server" role="Stopped" active="false" orphaned="false" managed="true" failed="false" failure_ignored="false" nodes_running_on="0" />
|
||||||
|
</clone>
|
||||||
|
<clone id="clone_p_neutron-metadata-agent" >
|
||||||
|
<resource id="p_neutron-metadata-agent" resource_agent="ocf::mirantis:neutron-agent-metadata" active="true" >
|
||||||
|
<node name="node-3" id="node-3" cached="false"/>
|
||||||
|
</resource>
|
||||||
|
<resource id="p_neutron-metadata-agent" resource_agent="ocf::mirantis:neutron-agent-metadata" active="true" >
|
||||||
|
<node name="node-2" id="node-2" cached="false"/>
|
||||||
|
</resource>
|
||||||
|
<resource id="p_neutron-metadata-agent" resource_agent="ocf::mirantis:neutron-agent-metadata" active="true" >
|
||||||
|
<node name="node-5" id="node-5" cached="false"/>
|
||||||
|
</resource>
|
||||||
|
</clone>
|
||||||
|
<group id="group__zabbix-server" number_resources="2" >
|
||||||
|
<resource id="vip__zbx_vip_mgmt" resource_agent="ocf::fuel:ns_IPaddr2" role="Started" active="true" orphaned="false" managed="true" failed="false" failure_ignored="false" nodes_running_on="1" >
|
||||||
|
<node name="node-6" id="6" cached="false"/>
|
||||||
|
</resource>
|
||||||
|
<resource id="p_zabbix-server" resource_agent="ocf::fuel:zabbix-server" role="Started" active="true" orphaned="false" managed="true" failed="false" failure_ignored="false" nodes_running_on="1" >
|
||||||
|
<node name="node-6" id="6" cached="false"/>
|
||||||
|
</resource>
|
||||||
|
</group>
|
||||||
|
</resources>
|
||||||
|
</crm_mon>
|
||||||
|
"""[1:] # noqa
|
||||||
|
|
|
@ -79,29 +79,93 @@ _default_exclude_services = ('p_mysql', 'p_haproxy', 'p_dns', 'p_ntp', 'vip',
|
||||||
'clone_p_vrouter')
|
'clone_p_vrouter')
|
||||||
|
|
||||||
|
|
||||||
def get_crm_services(status_out):
    """Yield the id of every top-level resource in a cibadmin XML dump.

    :param status_out: XML text produced by
        ``cibadmin --query --scope resources``
    :returns: generator of resource id strings, in document order
    """
    for element in ElementTree.fromstring(status_out):
        yield element.get('id')
|
def start_corosync_services(env):
    """Start all manageable corosync resources of *env* and wait for sync."""
    manage_corosync_services(env, 'start')
|
||||||
def stop_corosync_services(env):
    """Stop all manageable corosync resources of *env* and wait for sync."""
    manage_corosync_services(env, 'stop')
||||||
|
|
||||||
|
def manage_corosync_services(env, status):
    """Apply a crm action to every non-excluded resource, then wait for sync.

    :param env: environment object; one of its controllers executes the
        remote ``cibadmin``/``crm`` calls
    :param status: crm resource action, either ``'start'`` or ``'stop'``
    """
    node = env_util.get_one_controller(env)
    status_out = ssh.call_output(['cibadmin', '--query', '--scope',
                                  'resources'], node=node)
    # Skip infrastructure resources (mysql, haproxy, vips, ...) that must
    # keep running across the maintenance window.
    services_list = [
        res for res in get_crm_services(status_out)
        if not any(service in res for service in _default_exclude_services)
    ]

    for service in services_list:
        while True:
            try:
                ssh.call(['crm', 'resource', status, service],
                         node=node)
            except subprocess.CalledProcessError:
                # Pacemaker sometimes rejects requests that it cannot
                # process right away; sleep briefly and retry.
                time.sleep(1)
            else:
                break
    wait_for_corosync_services_sync(env, services_list, status)
|
||||||
|
|
||||||
|
|
||||||
|
def wait_for_corosync_services_sync(env, resource_list, status,
                                    timeout=1200, check_freq=20):
    """Poll crm_mon until every listed resource reaches the desired state.

    :param env: environment whose controller is polled
    :param resource_list: resource ids to watch
    :param status: ``'start'`` (wait until active) or anything else
        (wait until inactive)
    :param timeout: seconds to wait before giving up
    :param check_freq: seconds between polls
    :raises Exception: when the cluster is not synced within *timeout*
    """
    expect_active = status == 'start'
    node = env_util.get_one_controller(env)
    deadline = time.time() + timeout
    while True:
        crm_out = ssh.call_output(['crm_mon', '--as-xml'], node=node)
        if is_resources_synced(resource_list, crm_out, expect_active):
            return
        if time.time() >= deadline:
            raise Exception("Timeout waiting for corosync cluster for env %s"
                            " to be synced" % env.id)
        time.sleep(check_freq)
|
||||||
|
|
||||||
|
|
||||||
|
def is_resources_synced(resources, crm_out, status):
    """Tell whether all named resources match the expected active state.

    :param resources: iterable of resource ids to check
    :param crm_out: XML text from ``crm_mon --as-xml``
    :param status: expected activity as a bool (True == active)
    :returns: True when every resource's activity equals *status*
    """
    monitored = ElementTree.fromstring(crm_out).find('resources')

    def _lookup(resource_id):
        # A resource missing from crm_mon output is reported as None,
        # which is_resource_active treats as inactive.
        for candidate in monitored:
            if candidate.get('id') == resource_id:
                return candidate
        return None

    return all(is_resource_active(_lookup(name)) is status
               for name in resources)
|
||||||
|
|
||||||
|
|
||||||
|
# Resources are fetched from the output of the 'crm_mon' command. That
# command does not report a resource at all when it is not started, so an
# 'absent' resource is considered disabled.
def is_resource_active(resource):
    """Return True when the crm_mon element represents an active resource.

    ``None`` (resource absent from crm_mon output) counts as inactive.
    A plain <resource> primitive is checked directly; a clone/group is
    active only if every child primitive is active.
    """
    if resource is None:
        return False
    if resource.tag == 'resource':
        return is_primitive_active(resource)
    return all(is_primitive_active(child) for child in resource)
|
||||||
|
|
||||||
|
|
||||||
|
def is_primitive_active(resource):
    """Return True if the crm_mon XML element reports itself as active.

    crm_mon serializes booleans as the strings "true"/"false", so a
    missing or "false" attribute both mean inactive.
    """
    return resource.get('active') == 'true'
|
||||||
|
|
||||||
|
|
||||||
def stop_upstart_services(env):
|
def stop_upstart_services(env):
|
||||||
|
@ -129,22 +193,6 @@ def stop_upstart_services(env):
|
||||||
ssh.call(['stop', service], node=node)
|
ssh.call(['stop', service], node=node)
|
||||||
|
|
||||||
|
|
||||||
def start_corosync_services(env):
|
|
||||||
node = next(env_util.get_controllers(env))
|
|
||||||
status_out = ssh.call_output(['cibadmin', '--query', '--scope',
|
|
||||||
'resources'], node=node)
|
|
||||||
for service in get_crm_services(status_out):
|
|
||||||
while True:
|
|
||||||
try:
|
|
||||||
ssh.call(['crm', 'resource', 'start', service],
|
|
||||||
node=node)
|
|
||||||
except subprocess.CalledProcessError:
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
break
|
|
||||||
time.sleep(60)
|
|
||||||
|
|
||||||
|
|
||||||
def start_upstart_services(env):
|
def start_upstart_services(env):
|
||||||
controllers = list(env_util.get_controllers(env))
|
controllers = list(env_util.get_controllers(env))
|
||||||
for node in controllers:
|
for node in controllers:
|
||||||
|
@ -179,3 +227,15 @@ def start_cluster(env):
|
||||||
for node in controllers:
|
for node in controllers:
|
||||||
for cmd in cmds:
|
for cmd in cmds:
|
||||||
ssh.call(cmd, node=node)
|
ssh.call(cmd, node=node)
|
||||||
|
# When we start cluster we should wait while resources from constant
|
||||||
|
# `_default_exclude_services` become up and running. BTW, We don't touch
|
||||||
|
# these resources in stop/start corosync resources methods at all.
|
||||||
|
node = env_util.get_one_controller(env)
|
||||||
|
status_out = ssh.call_output(['cibadmin', '--query', '--scope',
|
||||||
|
'resources'], node=node)
|
||||||
|
services_list = []
|
||||||
|
for res in get_crm_services(status_out):
|
||||||
|
if any(service in res for service in _default_exclude_services):
|
||||||
|
services_list.append(res)
|
||||||
|
|
||||||
|
wait_for_corosync_services_sync(env, services_list, 'start')
|
||||||
|
|
Loading…
Reference in New Issue