Add Rally Instance HA test suite

This commit adds a Rally test suite that lets the user check whether
Instance HA is behaving correctly.
It can be extended with further tests in the future, possibly replacing
the current validate-ha approach to checking cluster-related behavior.

Change-Id: Ic9161a5f75fc33f4f61ab8549b18b43f7168197c
Raoul Scarazzini 2017-09-13 08:53:35 -04:00
parent 9eb8afa547
commit fad795e7cd
4 changed files with 667 additions and 0 deletions

rally/README.md Normal file

@@ -0,0 +1,53 @@
Rally tests
===========
This directory contains all the files needed to use Rally to test the behavior
of a TripleO environment.
For example, you can check whether Instance HA is behaving correctly inside the
overcloud in which it was configured.

Requirements
------------
A working and accessible TripleO environment, as described [here](https://github.com/redhat-openstack/tripleo-quickstart-utils/tree/master/README.md):
that means a *hosts* file containing the whole environment inventory and, if
needed, an *ssh.config.ansible* file with all the information needed to access
the nodes.
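
As a reference, here is a minimal sketch of what such an inventory could look
like, written in Ansible's YAML inventory format (the actual *hosts* file may
just as well be INI-style, and the hostnames, addresses and group members below
are purely illustrative):

    # Illustrative sketch only: the inventory for a real environment will
    # contain the actual nodes and their connection variables.
    all:
      children:
        undercloud:
          hosts:
            undercloud:
              ansible_host: 192.168.24.1
              ansible_user: stack
        overcloud:
          children:
            controller:
              hosts:
                overcloud-controller-0: {}
            compute:
              hosts:
                overcloud-novacompute-0: {}
                overcloud-novacompute-1: {}
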
How to use Rally to test Instance HA
------------------------------------
If you want to launch a Rally test session to check how Instance HA is behaving
in the overcloud, you can use a command like this one:

    ansible-playbook -i hosts \
        -e public_physical_network="public" \
        -e floating_ip_cidr="192.168.99.0/24" \
        -e public_net_pool_start="192.168.99.211" \
        -e public_net_pool_end="192.168.99.216" \
        -e public_net_gateway="192.168.99.254" \
        tripleo-quickstart-utils/rally/instance-ha.yml

This command can be launched from the *undercloud* machine or from a jump host
(which must have all the required files available locally).
The requested parameters refer to the settings of the network in which the
instances will be spawned.
This will execute the tests defined in the task template YAML:

* *InstanceHA.recover_instance_fip_and_volume*: spawn an instance, stop the
  compute node it is running on, check that the instance migrates and that the
  node recovers;
* *InstanceHA.recover_stopped_instance_fip*: spawn an instance, put it in
  stopped status, stop the compute node it is running on, check that the
  instance migrates and that the node recovers;
* *InstanceHA.recover_instance_two_cycles*: same as the first test, but
  repeated two times;
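
In addition to the scenarios listed above, the plugin also defines
*InstanceHA.recover_instance_nova_compute*, which kills the nova processes on
the compute node instead of crashing the whole node. It is not part of the
default task template, but if you want to run it as well you could add an entry
modeled on the existing ones to *templates/instance-ha.yaml.j2*, for example
(the values below simply mirror those of the other scenarios):

    InstanceHA.recover_instance_nova_compute:
      -
        args:
          flavor:
            name: "m1.tiny"
          image:
            name: cirros
          floating_network: "{{ public_physical_network }}-network"
          force_delete: false
          wait_for_ping: false
          computes:
            username: "heat-admin"
            key_filename: "/home/stack/.ssh/id_rsa"
            port: 22
        runner:
          type: "constant"
          times: 1
          concurrency: 1
        context:
          users:
            tenants: 2
            users_per_tenant: 1
          network: {}
        sla:
          failure_rate:
            max: 0.0
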
License
-------
GPL

Author Information
------------------
Raoul Scarazzini <rasca@redhat.com>

rally/instance-ha.yml Normal file

@@ -0,0 +1,99 @@
---
- hosts: undercloud
  gather_facts: no
  become: yes
  become_method: sudo
  tasks:
    - name: Install Rally dependencies
      shell: |
        # Python pip
        wget https://bootstrap.pypa.io/get-pip.py -O get-pip.py
        python get-pip.py
        # Dependencies
        yum install -y gmp-devel libffi-devel libxml2-devel libxslt-devel openssl-devel postgresql-devel gcc python-devel
- hosts: undercloud
  gather_facts: no
  tasks:
    - name: Install Rally
      shell: |
        # Install Rally from upstream
        wget -q -O- https://raw.githubusercontent.com/openstack/rally/master/install_rally.sh | bash |& tee rally-install.log
        mkdir -p .rally/plugins
    - name: Check Rally installation
      shell: |
        source /home/stack/rally/bin/activate
        rally --version

- hosts: undercloud
  gather_facts: no
  tasks:
    - name: Copy instance-ha Rally plugin to remote rally directory
      copy:
        src: plugins/instanceha.py
        dest: .rally/plugins

- hosts: undercloud
  gather_facts: no
  tasks:
    - name: Install Rally environment and create deployment
      shell: |
        source /home/stack/overcloudrc
        source /home/stack/rally/bin/activate
        export OS_INSECURE=True
        rally deployment create --fromenv --name overcloud |& tee rally-instance-ha-deployment-create.log
        rally deployment use overcloud

- hosts: undercloud
  gather_facts: no
  tasks:
    - name: Prepare overcloud env
      shell: |
        source /home/stack/overcloudrc
        projectid=$(openstack project list | awk '/admin/ {print $2}')
        wget -O /tmp/cirros-0.3.4-x86_64-disk.img http://download.cirros-cloud.net/0.3.4/cirros-0.3.4-x86_64-disk.img
        glance --os-project-id=$projectid image-create --name cirros --container-format bare --disk-format raw --file /tmp/cirros-0.3.4-x86_64-disk.img --visibility public
        nova flavor-create --ephemeral 0 --is-public True m1.tiny overcloud-instance-test-small-flavor 2048 20 1
        neutron net-create {{ public_physical_network }}-network --router:external=True --provider:physical_network {{ public_physical_network }} --provider:network_type flat
        neutron subnet-create --name {{ public_physical_network }}-subnet --disable-dhcp --allocation-pool start={{ public_net_pool_start }},end={{ public_net_pool_end }} --gateway {{ public_net_gateway }} {{ public_physical_network }}-network {{ floating_ip_cidr }}

- hosts: undercloud
  gather_facts: no
  tasks:
    - name: Copy Rally task file
      template:
        src: templates/instance-ha.yaml.j2
        dest: "/home/stack/instance-ha.yaml"
        mode: 0666
    - name: Start Rally task
      shell: |
        source /home/stack/rally/bin/activate
        rally task start --task /home/stack/instance-ha.yaml --deployment overcloud |& tee rally-instance-ha-run.log
    - name: Create Report JUnit
      shell: |
        source /home/stack/rally/bin/activate
        rally task report --junit --out /home/stack/nosetests.xml |& tee rally-instance-ha-report.log
    - fetch:
        src: "/home/stack/nosetests.xml"
        dest: "{{ lookup('env', 'PWD') }}/nosetests.xml"
        flat: yes

- hosts: undercloud
  gather_facts: no
  tasks:
    - name: Remove overcloud env
      shell: |
        source /home/stack/overcloudrc
        projectid=$(openstack project list | awk '/admin/ {print $2}')
        glance --os-project-id=$projectid image-delete $(glance --os-project-id=$projectid image-list | awk '/cirros/ {print $2}')
        nova flavor-delete overcloud-instance-test-small-flavor
        neutron net-delete {{ public_physical_network }}-network

rally/plugins/instanceha.py Normal file

@@ -0,0 +1,434 @@
import os
import six
import time

from os import path

from rally.common import logging
from rally.common import sshutils
from rally import consts
from rally import exceptions
from rally.plugins.openstack import scenario
from rally.plugins.openstack.scenarios.vm import utils as vm_utils
from rally.plugins.openstack.scenarios.cinder import utils as cinder_utils
from rally.task import atomic
from rally.task import types
from rally.task import validation
from rally.task import utils as task_utils

LOG = logging.getLogger(__name__)


def failover(self, host, command, port=22, username="", password="",
             key_filename=None, pkey=None):
    """Trigger failover at host.

    :param host: IP address of the compute node to disrupt
    :param command: command dictionary to execute on the host over SSH
    """
    if key_filename:
        key_filename = path.expanduser(key_filename)
    LOG.info("Host: %s. Injecting Failover %s" % (host,
                                                  command))
    try:
        code, out, err = _run_command(self, server_ip=host, port=port,
                                      username=username,
                                      password=password,
                                      key_filename=key_filename,
                                      pkey=pkey, command=command
                                      )
        if code and code > 0:
            raise exceptions.ScriptError(
                "Error running command %(command)s. "
                "Error %(code)s: %(error)s" % {
                    "command": command, "code": code, "error": err})
    except exceptions.SSHTimeout:
        LOG.debug("SSH session of disruptor command timed out, continuing...")
        pass


def _run_command(self, server_ip, port, username, password, command,
                 pkey=None, key_filename=None):
    """Run command via SSH on server.

    Create SSH connection for server, wait for server to become available
    (there is a delay between server being set to ACTIVE and sshd being
    available). Then call run_command_over_ssh to actually execute the
    command.

    Note: Shadows vm.utils.VMScenario._run_command to support key_filename.

    :param server_ip: server ip address
    :param port: ssh port for SSH connection
    :param username: str. ssh username for server
    :param password: Password for SSH authentication
    :param command: Dictionary specifying command to execute.
        See `rally info find VMTasks.boot_runcommand_delete' parameter
        `command' docstring for explanation.
    :param key_filename: private key filename for SSH authentication
    :param pkey: key for SSH authentication
    :returns: tuple (exit_status, stdout, stderr)
    """
    if not key_filename:
        pkey = pkey or self.context["user"]["keypair"]["private"]
    ssh = sshutils.SSH(username, server_ip, port=port,
                       pkey=pkey, password=password,
                       key_filename=key_filename)
    self._wait_for_ssh(ssh)
    return _run_command_over_ssh(self, ssh, command)


@atomic.action_timer("vm.run_command_over_ssh")
def _run_command_over_ssh(self, ssh, command):
    """Run command inside an instance.

    This is a separate function so that only script execution is timed.

    :param ssh: A SSHClient instance.
    :param command: Dictionary specifying command to execute.
        See `rally info find VMTasks.boot_runcommand_delete' parameter
        `command' docstring for explanation.
    :returns: tuple (exit_status, stdout, stderr)
    """
    cmd, stdin = [], None

    interpreter = command.get("interpreter") or []
    if interpreter:
        if isinstance(interpreter, six.string_types):
            interpreter = [interpreter]
        elif type(interpreter) != list:
            raise ValueError("command 'interpreter' value must be str "
                             "or list type")
        cmd.extend(interpreter)

    remote_path = command.get("remote_path") or []
    if remote_path:
        if isinstance(remote_path, six.string_types):
            remote_path = [remote_path]
        elif type(remote_path) != list:
            raise ValueError("command 'remote_path' value must be str "
                             "or list type")
        cmd.extend(remote_path)
        if command.get("local_path"):
            ssh.put_file(os.path.expanduser(
                command["local_path"]), remote_path[-1],
                mode=self.USER_RWX_OTHERS_RX_ACCESS_MODE)

    if command.get("script_file"):
        stdin = open(os.path.expanduser(command["script_file"]), "rb")
    elif command.get("script_inline"):
        stdin = six.moves.StringIO(command["script_inline"])

    cmd.extend(command.get("command_args") or [])

    return ssh.execute(cmd, stdin=stdin, timeout=10)


def one_killing_iteration(self, server, fip, computes, disruptor_cmd,
                          stop_instance):
    """Find the host where instance is hosted, disrupt the host and
    verify status of the instance after the failover"""
    server_admin = self.admin_clients("nova").servers.get(server.id)
    host_name_pre = getattr(server_admin, "OS-EXT-SRV-ATTR:host")
    hypervisors = self.admin_clients("nova").hypervisors.list()
    hostnames = []
    for hypervisor in hypervisors:
        hostnames.append(getattr(hypervisor, "hypervisor_hostname"))
        if getattr(hypervisor, "hypervisor_hostname") == host_name_pre:
            hypervisor_id = getattr(hypervisor, "id")
    hypervisor = self.admin_clients("nova").hypervisors.get(hypervisor_id)
    hypervisor_ip = getattr(hypervisor, "host_ip")

    if not disruptor_cmd:
        disruptor_cmd = {
            "script_inline": "sudo sh -c \"echo b > /proc/sysrq-trigger\"",
            "interpreter": "/bin/sh"
        }

    # Trigger failover of compute node hosting the instance
    failover(self, host=hypervisor_ip,
             command=disruptor_cmd,
             port=computes.get("port", 22),
             username=computes.get("username"),
             password=computes.get("password"),
             key_filename=computes.get("key_filename"),
             pkey=computes.get("pkey")
             )

    # Wait for instance to be moved to different host
    hostnames.remove(host_name_pre)
    task_utils.wait_for(
        server_admin,
        status_attr="OS-EXT-SRV-ATTR:host",
        ready_statuses=hostnames,
        update_resource=task_utils.get_from_manager(),
        timeout=120,
        check_interval=5
    )

    # Check the instance is SHUTOFF in the case of stopped instance or
    # that the instance is pingable
    if stop_instance:
        task_utils.wait_for(
            server,
            ready_statuses=["SHUTOFF"],
            update_resource=task_utils.get_from_manager(),
            timeout=60,
            check_interval=2
        )
        # server_admin = self.admin_clients("nova").servers.get(server.id)
        # host_name_post = getattr(server_admin, "OS-EXT-SRV-ATTR:host")
        # if host_name_post in host_name_pre:
        #     raise exceptions.InvalidHostException()
    else:
        try:
            if self.wait_for_ping:
                self._wait_for_ping(fip["ip"])
        except exceptions.TimeoutException:
            console_logs = self._get_server_console_output(server,
                                                           None)
            LOG.debug("VM console logs:\n%s", console_logs)
            raise


def recover_instance_ha(self, image, flavor, computes,
                        volume_args=None,
                        floating_network=None,
                        use_floating_ip=True,
                        force_delete=False,
                        stop_instance=False,
                        disruptor_cmd=None,
                        iterations=1,
                        wait_for_ping=True,
                        max_log_length=None,
                        **kwargs):
    """Boot a server, trigger failover of host and verify instance.

    :param image: glance image name to use for the vm
    :param flavor: VM flavor name
    :param computes: dictionary with credentials to the compute nodes
        consisting of username, password, port, key_filename, disruptor
        command and pkey.
        Examples::

            computes: {
                username: heat-admin,
                key_filename: /path/to/ssh/id_rsa.pub
                port: 22
            }

    :param volume_args: volume args for booting server from volume
    :param floating_network: external network name, for floating ip
    :param use_floating_ip: bool, floating or fixed IP for SSH connection
    :param force_delete: whether to use force_delete for servers
    :param stop_instance: whether to stop instance before disruptor command
    :param disruptor_cmd: command to be sent to the hosting compute node
    :param iterations: number of compute node killing iterations
    :param wait_for_ping: whether to check connectivity on server creation
    :param **kwargs: extra arguments for booting the server
    :param max_log_length: The number of tail nova console-log lines user
        would like to retrieve
    :returns:
    """
    self.wait_for_ping = wait_for_ping

    if volume_args:
        volume = self.cinder.create_volume(volume_args["size"], imageRef=None)
        kwargs["block_device_mapping"] = {"vdrally": "%s:::1" % volume.id}

    server, fip = self._boot_server_with_fip(
        image, flavor, use_floating_ip=use_floating_ip,
        floating_network=floating_network,
        key_name=self.context["user"]["keypair"]["name"],
        **kwargs)
    task_utils.wait_for(
        server,
        ready_statuses=["ACTIVE"],
        update_resource=task_utils.get_from_manager(),
        timeout=120,
        check_interval=2
    )

    try:
        if self.wait_for_ping:
            self._wait_for_ping(fip["ip"])
    except exceptions.TimeoutException:
        console_logs = self._get_server_console_output(server,
                                                       max_log_length)
        LOG.debug("VM console logs:\n%s", console_logs)
        raise

    if stop_instance:
        self._stop_server(server)
        task_utils.wait_for(
            server,
            ready_statuses=["SHUTOFF"],
            update_resource=task_utils.get_from_manager(),
            timeout=120,
            check_interval=2
        )

    # Wait a little before killing the compute node; if we do not wait, the
    # backing image can get corrupted (previously reported as a bug)
    time.sleep(30)

    for iteration in range(1, iterations + 1):
        one_killing_iteration(self, server, fip, computes,
                              disruptor_cmd, stop_instance)
        # Give cluster some time to recover original compute node
        LOG.info("Wait for compute nodes to come online after previous "
                 "disruption")
        time.sleep(360)

    if stop_instance:
        # Start instance if it was stopped.
        self._start_server(server)
        task_utils.wait_for(
            server,
            ready_statuses=["ACTIVE"],
            update_resource=task_utils.get_from_manager(),
            timeout=120,
            check_interval=2
        )

    self._delete_server_with_fip(server, fip, force_delete=force_delete)


@types.convert(image={"type": "glance_image"},
               flavor={"type": "nova_flavor"})
@validation.image_valid_on_flavor("flavor", "image")
@validation.valid_command("command", required=False)
@validation.number("port", minval=1, maxval=65535, nullable=True,
                   integer_only=True)
@validation.external_network_exists("floating_network")
@validation.required_services(consts.Service.NOVA, consts.Service.CINDER)
@validation.required_openstack(users=True, admin=True)
@scenario.configure(context={"cleanup@openstack": ["nova", "cinder"],
                             "keypair@openstack": {}, "allow_ssh@openstack": None},
                    name="InstanceHA.recover_instance_fip_and_volume",
                    platform="openstack")
class InstanceHARecoverFIPAndVolume(vm_utils.VMScenario, cinder_utils.CinderBasic):

    def __init__(self, *args, **kwargs):
        super(InstanceHARecoverFIPAndVolume, self).__init__(*args, **kwargs)

    def run(self, image, flavor, computes,
            volume_args=None,
            floating_network=None,
            use_floating_ip=True,
            force_delete=False,
            wait_for_ping=True,
            max_log_length=None,
            **kwargs):

        recover_instance_ha(self, image, flavor, computes,
                            volume_args=volume_args,
                            floating_network=floating_network,
                            use_floating_ip=use_floating_ip,
                            force_delete=force_delete,
                            wait_for_ping=wait_for_ping,
                            max_log_length=max_log_length,
                            **kwargs)


@types.convert(image={"type": "glance_image"},
               flavor={"type": "nova_flavor"})
@validation.image_valid_on_flavor("flavor", "image")
@validation.valid_command("command", required=False)
@validation.number("port", minval=1, maxval=65535, nullable=True,
                   integer_only=True)
@validation.external_network_exists("floating_network")
@validation.required_services(consts.Service.NOVA, consts.Service.CINDER)
@validation.required_openstack(users=True, admin=True)
@scenario.configure(context={"cleanup@openstack": ["nova", "cinder"],
                             "keypair@openstack": {}, "allow_ssh@openstack": None},
                    name="InstanceHA.recover_instance_two_cycles",
                    platform="openstack")
class InstanceHARecoverTwoCycle(vm_utils.VMScenario, cinder_utils.CinderBasic):

    def __init__(self, *args, **kwargs):
        super(InstanceHARecoverTwoCycle, self).__init__(*args, **kwargs)

    def run(self, image, flavor, computes,
            volume_args=None,
            floating_network=None,
            use_floating_ip=True,
            force_delete=False,
            wait_for_ping=True,
            max_log_length=None,
            **kwargs):

        recover_instance_ha(self, image, flavor, computes,
                            volume_args=volume_args,
                            floating_network=floating_network,
                            use_floating_ip=use_floating_ip,
                            force_delete=force_delete,
                            iterations=2,
                            wait_for_ping=wait_for_ping,
                            max_log_length=max_log_length,
                            **kwargs)


@types.convert(image={"type": "glance_image"},
               flavor={"type": "nova_flavor"})
@validation.image_valid_on_flavor("flavor", "image")
@validation.valid_command("command", required=False)
@validation.number("port", minval=1, maxval=65535, nullable=True,
                   integer_only=True)
@validation.external_network_exists("floating_network")
@validation.required_services(consts.Service.NOVA, consts.Service.CINDER)
@validation.required_openstack(users=True, admin=True)
@scenario.configure(context={"cleanup@openstack": ["nova", "cinder"],
                             "keypair@openstack": {}, "allow_ssh@openstack": None},
                    name="InstanceHA.recover_stopped_instance_fip",
                    platform="openstack")
class InstanceHARecoverStopped(vm_utils.VMScenario, cinder_utils.CinderBasic):

    def __init__(self, *args, **kwargs):
        super(InstanceHARecoverStopped, self).__init__(*args, **kwargs)

    def run(self, image, flavor, computes,
            volume_args=None,
            floating_network=None,
            use_floating_ip=True,
            force_delete=False,
            wait_for_ping=True,
            max_log_length=None,
            **kwargs):

        recover_instance_ha(self, image, flavor, computes,
                            volume_args=volume_args,
                            floating_network=floating_network,
                            use_floating_ip=use_floating_ip,
                            force_delete=force_delete,
                            stop_instance=True,
                            wait_for_ping=wait_for_ping,
                            max_log_length=max_log_length,
                            **kwargs)


@types.convert(image={"type": "glance_image"},
               flavor={"type": "nova_flavor"})
@validation.image_valid_on_flavor("flavor", "image")
@validation.valid_command("command", required=False)
@validation.number("port", minval=1, maxval=65535, nullable=True,
                   integer_only=True)
@validation.external_network_exists("floating_network")
@validation.required_services(consts.Service.NOVA, consts.Service.CINDER)
@validation.required_openstack(users=True, admin=True)
@scenario.configure(context={"cleanup@openstack": ["nova", "cinder"],
                             "keypair@openstack": {}, "allow_ssh@openstack": None},
                    name="InstanceHA.recover_instance_nova_compute",
                    platform="openstack")
class InstanceHARecoverNovaCompute(vm_utils.VMScenario, cinder_utils.CinderBasic):

    def __init__(self, *args, **kwargs):
        super(InstanceHARecoverNovaCompute, self).__init__(*args, **kwargs)

    def run(self, image, flavor, computes,
            volume_args=None,
            floating_network=None,
            use_floating_ip=True,
            force_delete=False,
            wait_for_ping=True,
            max_log_length=None,
            **kwargs):

        disruptor_cmd = {
            "script_inline": "sudo kill -9 $(ps -ef | grep ^nova* | awk \'{print$2}\'); echo {}",
            "interpreter": "/bin/sh"
        }
        recover_instance_ha(self, image, flavor, computes,
                            volume_args=volume_args,
                            floating_network=floating_network,
                            use_floating_ip=use_floating_ip,
                            force_delete=force_delete,
                            disruptor_cmd=disruptor_cmd,
                            wait_for_ping=wait_for_ping,
                            max_log_length=max_log_length,
                            **kwargs)

rally/templates/instance-ha.yaml.j2 Normal file

@@ -0,0 +1,81 @@
---
InstanceHA.recover_instance_fip_and_volume:
  -
    args:
      flavor:
        name: "m1.tiny"
      image:
        name: cirros
      volume_args:
        size: 1
      floating_network: "{{ public_physical_network }}-network"
      force_delete: false
      wait_for_ping: false
      computes:
        username: "heat-admin"
        key_filename: "/home/stack/.ssh/id_rsa"
        port: 22
    runner:
      type: "constant"
      times: 1
      concurrency: 1
    context:
      users:
        tenants: 2
        users_per_tenant: 1
      network: {}
    sla:
      failure_rate:
        max: 0.0

InstanceHA.recover_stopped_instance_fip:
  -
    args:
      flavor:
        name: "m1.tiny"
      image:
        name: cirros
      floating_network: "{{ public_physical_network }}-network"
      force_delete: false
      wait_for_ping: false
      computes:
        username: "heat-admin"
        key_filename: "/home/stack/.ssh/id_rsa"
        port: 22
    runner:
      type: "constant"
      times: 1
      concurrency: 1
    context:
      users:
        tenants: 2
        users_per_tenant: 1
      network: {}
    sla:
      failure_rate:
        max: 0.0

InstanceHA.recover_instance_two_cycles:
  -
    args:
      flavor:
        name: "m1.tiny"
      image:
        name: cirros
      floating_network: "{{ public_physical_network }}-network"
      force_delete: false
      wait_for_ping: false
      computes:
        username: "heat-admin"
        key_filename: "/home/stack/.ssh/id_rsa"
        port: 22
    runner:
      type: "constant"
      times: 1
      concurrency: 1
    context:
      users:
        tenants: 2
        users_per_tenant: 1
      network: {}
    sla:
      failure_rate:
        max: 0.0