fuel-ostf/fuel_health/ha_base.py

# Copyright 2015 Mirantis, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from distutils import version
import json
import logging
from lxml import etree
import fuel_health
from fuel_health.common import ssh
from fuel_health.common.utils import data_utils
from fuel_health.test import BaseTestCase
LOG = logging.getLogger(__name__)
class RabbitSanityClass(BaseTestCase):
"""TestClass contains RabbitMQ sanity checks."""
@classmethod
def setUpClass(cls):
cls.config = fuel_health.config.FuelConfig()
cls._controllers = cls.config.compute.online_controllers
cls.nodes = cls.config.compute.nodes
cls._usr = cls.config.compute.controller_node_ssh_user
cls._pwd = cls.config.compute.controller_node_ssh_password
cls._key = cls.config.compute.path_to_private_key
cls._ssh_timeout = cls.config.compute.ssh_timeout
cls._password = None
cls._userid = None
cls.messages = []
cls.queues = []
cls.release_version = \
cls.config.compute.release_version.split('-')[1]
@property
def password(self):
if version.StrictVersion(self.release_version)\
< version.StrictVersion('7.0'):
self._password = self.get_conf_values().strip()
return self._password
if self._password is None:
self._password = self.get_hiera_values(
hiera_hash='rabbit',
hash_key='password'
)
# FIXME(mattmymo): Remove this after merging
# https://review.openstack.org/#/c/276797/
if not self._password:
self._password = self.get_hiera_values(
hiera_hash='rabbit_hash',
hash_key='password'
)
return self._password
@property
def amqp_hosts_name(self):
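        """Map node names to [ip, port] of their AMQP endpoints.
        For releases older than 7.0 the keys are controller IPs instead of
        node names; only online nodes are included. Example shape with
        illustrative values: {'node-1': ['10.109.1.4', '5673']}
        """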
amqp_hosts_name = {}
if version.StrictVersion(self.release_version)\
< version.StrictVersion('7.0'):
for controller_ip in self._controllers:
amqp_hosts_name[controller_ip] = [controller_ip, '5673']
return amqp_hosts_name
nodes = self.get_hiera_values(hiera_hash='network_metadata',
hash_key='nodes',
json_parse=True)
for ip, port in self.get_amqp_hosts():
for node in nodes:
ips = [nodes[node]['network_roles'][role]
for role in nodes[node]['network_roles']]
if ip in ips:
nailgun_nodes = [n for n in self.nodes
if nodes[node]['name'] == n['hostname']
and n['online']]
if len(nailgun_nodes) == 1:
amqp_hosts_name[nodes[node]['name']] = [ip, port]
return amqp_hosts_name
@property
def userid(self):
if version.StrictVersion(self.release_version)\
< version.StrictVersion('7.0'):
self._userid = 'nova'
return self._userid
if self._userid is None:
self._userid = self.get_hiera_values(
hiera_hash='rabbit',
hash_key='user'
)
# FIXME(mattmymo): Remove this after merging
# https://review.openstack.org/#/c/276797/
if not self._userid:
self._userid = self.get_hiera_values(
hiera_hash='rabbit_hash',
hash_key='user'
)
return self._userid
def get_ssh_connection_to_controller(self, controller):
remote = ssh.Client(host=controller,
username=self._usr,
password=self._pwd,
key_filename=self._key,
timeout=self._ssh_timeout)
return remote
def list_nodes(self):
if not self.amqp_hosts_name:
self.fail('There are no online rabbit nodes')
remote = \
self.get_ssh_connection_to_controller(
self.amqp_hosts_name.keys()[0])
output = remote.exec_command("rabbitmqctl cluster_status")
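        # 'rabbitmqctl cluster_status' prints Erlang terms; the interesting
        # block looks roughly like (illustrative node names):
        #   {running_nodes,['rabbit@node-1','rabbit@node-2']},
        # so running nodes are counted via the 'rabbit@' occurrences below.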
substring_ind = output.find('{running_nodes')
sub_end_ind = output.find('cluster_name')
result_str = output[substring_ind: sub_end_ind]
num_node = result_str.count("rabbit@")
return num_node
def pick_rabbit_master(self):
if not self.amqp_hosts_name:
self.fail('There are no online rabbit nodes')
remote = \
self.get_ssh_connection_to_controller(
self.amqp_hosts_name.keys()[0])
        LOG.info('SSH session to node {0} was opened'.format(
            self.amqp_hosts_name.keys()[0]))
        LOG.info('Trying to execute command <crm resource '
                 'status master_p_rabbitmq-server>')
output = remote.exec_command(
"crm resource status master_p_rabbitmq-server")
LOG.debug('Output is {0}'.format(output))
substring_ind = output.find(
'resource master_p_rabbitmq-server is running on:')
sub_end_ind = output.find('Master')
LOG.debug('Start index is {0} end'
' index is {1}'.format(substring_ind, sub_end_ind))
result_str = output[substring_ind: sub_end_ind]
LOG.debug('Result string is {0}'.format(result_str))
return result_str
def list_channels(self):
if not self.amqp_hosts_name:
self.fail('There are no online rabbit nodes')
remote = \
self.get_ssh_connection_to_controller(
self.amqp_hosts_name.keys()[0])
output = remote.exec_command("rabbitmqctl list_channels")
LOG.debug('Result of executing command rabbitmqctl'
' list_channels is {0}'.format(output))
return output
def get_hiera_values(self, hiera_hash="rabbit",
hash_key=None,
conf_path="/etc/hiera.yaml",
json_parse=False):
if hash_key is not None:
lookup_cmd = ('value = hiera.lookup("{0}", {{}}, '
'{{}}, nil, :hash)["{1}"]').format(hiera_hash,
hash_key)
else:
lookup_cmd = ('value = hiera.lookup("{0}", {{}},'
' {{}}, nil, :hash)').format(hiera_hash)
if json_parse:
print_cmd = 'require "json"; puts JSON.dump(value)'
else:
print_cmd = 'puts value'
cmd = ('ruby -e \'require "hiera"; '
'hiera = Hiera.new(:config => "{0}"); '
'{1}; {2};\'').format(conf_path, lookup_cmd, print_cmd)
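        # For example, get_hiera_values(hiera_hash='rabbit',
        # hash_key='password') builds roughly this shell command:
        #   ruby -e 'require "hiera";
        #            hiera = Hiera.new(:config => "/etc/hiera.yaml");
        #            value = hiera.lookup("rabbit", {}, {}, nil,
        #                                 :hash)["password"]; puts value;'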
LOG.debug("Try to execute cmd {0}".format(cmd))
remote = self.get_ssh_connection_to_controller(self._controllers[0])
try:
res = remote.exec_command(cmd)
LOG.debug("result is {0}".format(res))
if json_parse:
return json.loads(res.strip())
return res.strip()
except Exception:
LOG.exception("Fail to get data from Hiera DB!")
self.fail("Fail to get data from Hiera DB!")
def get_conf_values(self, variable="rabbit_password",
sections="DEFAULT",
conf_path="/etc/nova/nova.conf"):
cmd = ("python -c 'import ConfigParser; "
"cfg=ConfigParser.ConfigParser(); "
"cfg.readfp(open('\"'{0}'\"')); "
"print cfg.get('\"'{1}'\"', '\"'{2}'\"')'")
LOG.debug("Try to execute cmd {0}".format(cmd))
remote = self.get_ssh_connection_to_controller(self._controllers[0])
try:
res = remote.exec_command(cmd.format(
conf_path, sections, variable))
LOG.debug("result is {0}".format(res))
return res
except Exception:
LOG.exception("Fail to get data from config")
self.fail("Fail to get data from config")
def get_amqp_hosts(self):
if not self._controllers:
self.fail('There are no online controllers')
remote = self.get_ssh_connection_to_controller(self._controllers[0])
cmd = 'hiera amqp_hosts'
LOG.debug("Try to execute cmd '{0}' on controller...".format(cmd))
result = remote.exec_command(cmd)
LOG.debug("Result: {0}".format(result))
hosts = result.strip().split(',')
return [host.lstrip().split(':')[0:2] for host in hosts]
def check_rabbit_connections(self):
if not self._controllers:
self.fail('There are no online controllers')
remote = self.get_ssh_connection_to_controller(self._controllers[0])
for key in self.amqp_hosts_name.keys():
ip, port = self.amqp_hosts_name[key]
cmd = ("python -c 'import kombu;"
" c = kombu.Connection(\"amqp://{1}:{2}@{0}:{3}//\");"
" c.connect()'".format(ip, self.userid,
self.password, port))
try:
LOG.debug('Checking AMQP host "{0}"...'.format(ip))
remote.exec_command(cmd)
except Exception:
LOG.exception("Failed to establish AMQP connection")
self.fail("Failed to establish AMQP connection to {1}/tcp "
"port on {0} from controller node!".format(ip, port))
def create_queue(self):
if not self._controllers:
self.fail('There are no online controllers')
remote = self.get_ssh_connection_to_controller(self._controllers[0])
for key in self.amqp_hosts_name.keys():
ip, port = self.amqp_hosts_name[key]
test_queue = 'test-rabbit-{0}-{1}'.format(
data_utils.rand_name() + data_utils.generate_uuid(),
ip
)
cmd = ("python -c 'import kombu;"
" c = kombu.Connection(\"amqp://{1}:{2}@{0}:{3}//\");"
" c.connect(); ch = c.channel(); q = kombu.Qu"
"eue(\"{4}\", channel=ch, durable=False, queue_arguments={{"
"\"x-expires\": 15 * 60 * 1000}}); q.declare()'".format(
ip, self.userid, self.password, port, test_queue))
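            # The 'x-expires' argument is in milliseconds: RabbitMQ drops the
            # test queue automatically once it stays unused for 15 minutes,
            # so leftovers from interrupted runs are cleaned up.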
try:
LOG.debug("Declaring queue {0} on host {1}".format(
test_queue, ip))
self.queues.append(test_queue)
remote.exec_command(cmd)
except Exception:
LOG.exception("Failed to declare queue on host")
self.fail("Failed to declare queue on host {0}".format(ip))
def publish_message(self):
if not self._controllers:
self.fail('There are no online controllers')
remote = self.get_ssh_connection_to_controller(self._controllers[0])
for key in self.amqp_hosts_name.keys():
ip, port = self.amqp_hosts_name[key]
queues = [q for q in self.queues if ip in q]
            if not queues:
self.fail("Can't publish message, queue created on host '{0}' "
"doesn't exist!".format(ip))
test_queue = queues[0]
            msg_id = data_utils.generate_uuid()
            cmd = ("python -c 'import kombu;"
                   " c = kombu.Connection(\"amqp://{1}:{2}@{0}:{3}//\");"
                   " c.connect(); ch = c.channel(); producer = "
                   "kombu.Producer(channel=ch, routing_key=\"{4}\"); "
                   "producer.publish(\"{5}\")'".format(
                       ip, self.userid, self.password, port, test_queue,
                       msg_id))
            try:
                LOG.debug('Trying to publish message {0}'.format(msg_id))
                remote.exec_command(cmd)
            except Exception:
                LOG.exception("Failed to publish message!")
                self.fail("Failed to publish message!")
            self.messages.append({'queue': test_queue, 'id': msg_id})
def check_queue_message_replication(self):
if not self._controllers:
self.fail('There are no online controllers')
remote = self.get_ssh_connection_to_controller(self._controllers[0])
for key in self.amqp_hosts_name.keys():
ip, port = self.amqp_hosts_name[key]
for message in self.messages:
if ip in message['queue']:
continue
cmd = ("python -c 'import kombu;"
" c = kombu.Connection(\"amqp://{1}:{2}@{0}:{3}//\");"
" c.connect(); "
"ch = c.channel(); q = kombu.Queue(\"{4}\", channel=ch)"
"; msg = q.get(True); retval = 0 if msg.body in \"{5}\""
" else 1; exit(retval)'".format(
ip, self.userid, self.password, port,
message['queue'], message['id']))
try:
                    LOG.debug('Checking that message with ID "{0}" was '
                              'replicated over the cluster...'.format(
                                  message['id']))
remote.exec_command(cmd)
except Exception:
LOG.exception('Failed to check message replication!')
self.fail('Failed to check message replication!')
self.messages.remove(message)
break
def delete_queue(self):
if not self._controllers:
self.fail('There are no online controllers')
remote = self.get_ssh_connection_to_controller(self._controllers[0])
        LOG.debug('Trying to delete queues {0}...'.format(self.queues))
if not self.queues:
return
host_key = self.amqp_hosts_name.keys()[0]
ip, port = self.amqp_hosts_name[host_key]
        # Iterate over a copy: queues are removed from self.queues below
        for test_queue in list(self.queues):
cmd = ("python -c 'import kombu;"
" c = kombu.Connection(\"amqp://{1}:{2}@{0}:{3}//\");"
" c.connect(); ch = c.channel(); q = kombu.Qu"
"eue(\"{4}\", channel=ch); q.delete();'".format(
ip, self.userid, self.password, port, test_queue))
try:
LOG.debug("Removing queue {0} on host {1}".format(
test_queue, ip))
remote.exec_command(cmd)
self.queues.remove(test_queue)
except Exception:
LOG.exception('Failed to delete queue')
self.fail('Failed to delete queue "{0}"!'.format(test_queue))
class TestPacemakerBase(BaseTestCase):
"""TestPacemakerStatus class base methods."""
@classmethod
def setUpClass(cls):
super(TestPacemakerBase, cls).setUpClass()
cls.config = fuel_health.config.FuelConfig()
cls.controller_names = cls.config.compute.controller_names
cls.online_controller_names = (
cls.config.compute.online_controller_names)
cls.offline_controller_names = list(
set(cls.controller_names) - set(cls.online_controller_names))
cls.online_controller_ips = cls.config.compute.online_controllers
cls.controller_key = cls.config.compute.path_to_private_key
cls.controller_user = cls.config.compute.ssh_user
cls.controllers_pwd = cls.config.compute.controller_node_ssh_password
cls.timeout = cls.config.compute.ssh_timeout
def setUp(self):
super(TestPacemakerBase, self).setUp()
if 'ha' not in self.config.mode:
            self.skipTest('Cluster is not in HA mode, skipping tests')
if not self.online_controller_names:
self.skipTest('There are no controller nodes')
def _run_ssh_cmd(self, host, cmd):
"""Open SSH session with host and execute command."""
try:
sshclient = ssh.Client(host, self.controller_user,
self.controllers_pwd,
key_filename=self.controller_key,
timeout=self.timeout)
return sshclient.exec_longrun_command(cmd)
except Exception:
LOG.exception("Failed on run ssh cmd")
self.fail("%s command failed." % cmd)
def _register_resource(self, res, res_name, resources):
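        """Accumulate the state of a pacemaker <resource> XML element.
        A resource element from the XML cluster status looks roughly like
        this (the attribute set may vary between pacemaker versions):
            <resource id="p_haproxy" role="Started" active="true"
                      managed="true" failed="false" nodes_running_on="1">
                <node name="node-1" id="1" cached="false"/>
            </resource>
        """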
if res_name not in resources:
resources[res_name] = {
'master': [],
'nodes': [],
'started': 0,
'stopped': 0,
'active': False,
'managed': False,
'failed': False,
}
if 'true' in res.get('active'):
resources[res_name]['active'] = True
if 'true' in res.get('managed'):
resources[res_name]['managed'] = True
if 'true' in res.get('failed'):
resources[res_name]['failed'] = True
res_role = res.get('role')
num_nodes = int(res.get('nodes_running_on'))
if num_nodes:
resources[res_name]['started'] += num_nodes
for rnode in res.iter('node'):
if 'Master' in res_role:
resources[res_name]['master'].append(
rnode.get('name'))
resources[res_name]['nodes'].append(
rnode.get('name'))
else:
resources[res_name]['stopped'] += 1
def get_pcs_resources(self, pcs_status):
"""Get pacemaker resources status to a python dict:
return:
{
str: { # Resource name
'started': int, # count of Master/Started
'stopped': int, # count of Stopped resources
'nodes': [node_name, ...], # All node names where the
# resource is started
'master': [node_name, ...], # Node names for 'Master'
# ('master' is also in 'nodes')
'active': bool, # Is resource active?
'managed': bool, # Is resource managed?
'failed': bool, # Has resource failed?
},
...
}
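        Example (illustrative resource and node names):
            {'p_rabbitmq-server': {'started': 3, 'stopped': 0,
                                   'nodes': ['node-1', 'node-2', 'node-3'],
                                   'master': ['node-1'],
                                   'active': True, 'managed': True,
                                   'failed': False}}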
"""
root = etree.fromstring(pcs_status)
resources = {}
for res_group in root.iter('resources'):
for res in res_group:
res_name = res.get('id')
if 'resource' in res.tag:
self._register_resource(res, res_name, resources)
elif 'clone' in res.tag:
for r in res:
self._register_resource(r, res_name, resources)
elif 'group' in res.tag:
for r in res:
res_name_ingroup = r.get('id')
self._register_resource(r, res_name_ingroup, resources)
self._register_resource(r, res_name, resources)
return resources
def get_pcs_nodes(self, pcs_status):
root = etree.fromstring(pcs_status)
nodes = {'Online': [], 'Offline': []}
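        # Each <node> element of the XML status carries an 'online'
        # attribute, e.g. <node name="node-1" online="true" .../>
        # (illustrative name).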
for nodes_group in root.iter('nodes'):
for node in nodes_group:
if 'true' in node.get('online'):
nodes['Online'].append(node.get('name'))
else:
nodes['Offline'].append(node.get('name'))
return nodes
def get_pcs_constraints(self, constraints_xml):
"""Parse pacemaker constraints
:param constraints_xml: XML string contains pacemaker constraints
:return dict:
{string: # Resource name,
{'attrs': list, # List of dicts for resource
# attributes on each node,
'enabled': list # List of strings with node names where
# the resource allowed to start,
'with-rsc': string # Name of an another resource
# from which this resource depends on.
}
}
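        Example (illustrative names):
            {'vip__public': {'attrs': [{...}, {...}],
                             'enabled': ['node-1', 'node-2'],
                             'with-rsc': 'clone_p_haproxy'}}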
"""
root = etree.fromstring(constraints_xml)
constraints = {}
# 1. Get all attributes from constraints for each resource
for con_group in root.iter('constraints'):
for con in con_group:
if 'rsc_location' in con.tag or 'rsc_colocation' in con.tag:
if 'score' not in con.keys():
                        # TODO(ddmitriev): process resource dependencies
# for 'rule' section
continue
rsc = con.get('rsc')
if rsc not in constraints:
constraints[rsc] = {'attrs': [con.attrib]}
else:
constraints[rsc]['attrs'].append(con.attrib)
        # 2. Make a list of nodes for each resource where it is allowed to
        #    start, removing all nodes with score '-INFINITY' from the
        #    'enabled' list
for rsc in constraints:
attrs = constraints[rsc]['attrs']
enabled = []
disabled = []
for attr in attrs:
                if 'with-rsc' in attr:
                    constraints[rsc]['with-rsc'] = attr['with-rsc']
                    # Keep the colocation score so that get_resource_nodes()
                    # can distinguish colocation from anti-colocation
                    if 'score' in attr:
                        constraints[rsc]['score'] = attr['score']
elif 'node' in attr:
if attr['score'] == '-INFINITY':
disabled.append(attr['node'])
else:
enabled.append(attr['node'])
constraints[rsc]['enabled'] = list(set(enabled) - set(disabled))
return constraints
def get_resource_nodes(self, rsc, constraints, cluster_resources,
orig_rsc):
if rsc in orig_rsc:
# Constraints loop detected!
msg = ('There is a dependency loop in constraints configuration: '
'resource "{0}" depends on the resource "{1}". Please check'
' the pacemaker configuration!'
.format(orig_rsc[-1], rsc))
raise fuel_health.exceptions.InvalidConfiguration(msg)
else:
orig_rsc.append(rsc)
# Nodes where the resource is allowed to start
allowed = constraints[rsc]['enabled']
        # Nodes where this resource is actually started
if rsc in cluster_resources:
started = cluster_resources[rsc]['nodes']
else:
started = []
if 'with-rsc' in constraints[rsc]:
# Recursively get nodes for the parent resource
(parent_allowed,
parent_started,
parent_disallowed) = self.get_resource_nodes(
constraints[rsc]['with-rsc'],
constraints,
cluster_resources,
orig_rsc)
if 'score' in constraints[rsc]:
if constraints[rsc]['score'] == '-INFINITY':
                    # If the resource is banned from starting on the same
                    # nodes where the parent resource is running, remove
                    # those nodes from 'allowed'
allowed = (set(allowed) - set(parent_started))
else:
# Reduce 'allowed' list to only those nodes where
# the parent resource is allowed and running
allowed = list(set(parent_started) &
set(parent_allowed) &
set(allowed))
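        # Example (illustrative names): if "vip__public" is colocated with
        # "clone_p_haproxy" (score INFINITY), 'allowed' shrinks to the nodes
        # where haproxy is both allowed and running; with score '-INFINITY'
        # those nodes are excluded instead.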
# List of nodes, where resource is started, but not allowed to start
disallowed = list(set(started) - set(allowed))
return allowed, started, disallowed