# tobiko/tobiko/tripleo/pacemaker.py

from __future__ import absolute_import

import enum
import io
import time
import typing

from oslo_log import log
import pandas

import tobiko
from tobiko import config
from tobiko.tripleo import overcloud
from tobiko.shell import sh
from tobiko.shell import ssh
from tobiko.openstack import topology

CONF = config.CONF
LOG = log.getLogger(__name__)

GALERA_RESOURCE = "galera-bundle"
HAPROXY_RESOURCE = "haproxy-bundle"
OVN_DBS_RESOURCE = "ovn-dbs-bundle"


class PcsResourceException(tobiko.TobikoException):
    message = "pcs cluster is not in a healthy state"


def get_pcs_resources_table(timeout=720, interval=2) -> pandas.DataFrame:
    """
    Get pcs status from a controller and parse it
    so that the state of its resources can be checked.

    Example rows from the parsed output:

        rabbitmq-bundle-0 (ocf::heartbeat:rabbitmq-cluster): Started controller-0
        ip-10.0.0.101 (ocf::heartbeat:IPaddr2): Started controller-1
        openstack-cinder-volume-docker-0 (ocf::heartbeat:docker): Started controller-0

    :return: dataframe of pcs resources stats table
    """
failures: typing.List[str] = []
start = time.time()
# prevent pcs table read failure while pacemaker is starting
while time.time() - start < timeout:
failures = []
try:
output = run_pcs_status(options=['resources'], grep_str='ocf')
            # drop the '*' bullet that newer pcs versions prefix to each
            # resource line, so the table keeps its 4 columns
output = output.replace('*', '').strip()
stream = io.StringIO(output)
table: pandas.DataFrame = pandas.read_csv(
stream, delim_whitespace=True, header=None)
table.columns = ['resource', 'resource_type', 'resource_state',
'overcloud_node']
except ValueError:
pcs_status_raw = run_pcs_status()
            failures.append('pcs status table import failed: '
                            f'pcs status stdout:\n {pcs_status_raw}')
            LOG.info('Retrying, time left before timeout: {}'
                     .format(timeout - (time.time() - start)))
time.sleep(interval)
else:
break
# exhausted all retries
if failures:
tobiko.fail('pcs status table import error\n' + '\n'.join(failures))
    LOG.debug("Got pcs status:\n%s", table)
return table
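
# A minimal usage sketch of get_pcs_resources_table(); the node names in the
# comment are hypothetical, they only illustrate the dataframe layout
# (columns: resource, resource_type, resource_state, overcloud_node):
#
#     table = get_pcs_resources_table()
#     started = table[table['resource_state'] == 'Started']
#     print(started['overcloud_node'].unique().tolist())
#     # e.g. ['controller-0', 'controller-1', 'controller-2']
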
def get_pcs_prefix_and_status_values():
    """Return the OCF prefix and promoted/unpromoted status strings.

    The strings reported by `pcs status` changed between releases: before
    OSP 17.0 resources are prefixed with 'ocf::' and use the
    'Master'/'Slave' states, while later releases use 'ocf:' and
    'Promoted'/'Unpromoted'.
    """
    if topology.verify_osp_version('17.0', lower=True):
ocf_prefix = "ocf::"
promoted_status_str = "Master"
unpromoted_status_str = "Slave"
else:
ocf_prefix = "ocf:"
promoted_status_str = "Promoted"
unpromoted_status_str = "Unpromoted"
return ocf_prefix, promoted_status_str, unpromoted_status_str
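
# Sketch of how the version-dependent values are typically consumed; on
# OSP >= 17.0 the returned tuple would be ('ocf:', 'Promoted', 'Unpromoted'):
#
#     ocf_prefix, promoted, unpromoted = get_pcs_prefix_and_status_values()
#     galera_type = f"({ocf_prefix}heartbeat:galera):"
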
class PacemakerResourcesStatus(object):
"""
class to handle pcs resources checks
"""
def __init__(self):
self.pcs_df = get_pcs_resources_table()
(self.ocf_prefix,
self.promoted_status_str,
self.unpromoted_status_str) = get_pcs_prefix_and_status_values()
def container_runtime(self):
if not self.pcs_df[(self.pcs_df['resource_type'] ==
f"({self.ocf_prefix}heartbeat:docker):")].empty:
return 'docker'
if not self.pcs_df[(self.pcs_df['resource_type'] ==
f"({self.ocf_prefix}heartbeat:podman):")].empty:
return 'podman'
def resource_count(self, resource_type):
return self.pcs_df[(self.pcs_df['resource_type'] == resource_type)][
'resource_state'].count()
def resource_count_in_state(self, resource_type, resource_state):
return self.pcs_df[(self.pcs_df['resource_type'] ==
resource_type) & (self.pcs_df['resource_state'] ==
resource_state)][
'resource_state'].count()
def rabbitmq_resource_healthy(self):
rabbitmq_resource_str = \
f"({self.ocf_prefix}heartbeat:rabbitmq-cluster):"
nodes_num = self.resource_count(rabbitmq_resource_str)
started_num = self.resource_count_in_state(
rabbitmq_resource_str, "Started")
if nodes_num == started_num and nodes_num > 0:
LOG.info("pcs status check: resource rabbitmq is in healthy state")
return True
else:
LOG.info("pcs status check: resource rabbitmq not in healthy "
"state")
return False
def galera_resource_healthy(self):
galera_resource_str = f"({self.ocf_prefix}heartbeat:galera):"
nodes_num = self.resource_count(galera_resource_str)
master_num = self.resource_count_in_state(
galera_resource_str, self.promoted_status_str)
if nodes_num == master_num and nodes_num > 0:
LOG.info("pcs status check: resource galera is in healthy state")
return True
else:
LOG.info("pcs status check: resource galera not in healthy state")
return False
def redis_resource_healthy(self):
redis_resource_str = f"({self.ocf_prefix}heartbeat:redis):"
if not overcloud.is_redis_expected():
LOG.info("redis resource not expected on OSP 17 "
"and later releases by default")
return self.pcs_df.query(
f'resource_type == "{redis_resource_str}"').empty
nodes_num = self.resource_count(redis_resource_str)
master_num = self.resource_count_in_state(
redis_resource_str, self.promoted_status_str)
slave_num = self.resource_count_in_state(
redis_resource_str, self.unpromoted_status_str)
if (master_num == 1) and (slave_num == nodes_num - master_num):
LOG.info("pcs status check: resource redis is in healthy state")
return True
else:
LOG.info("pcs status check: resource redis not in healthy state")
return False
def vips_resource_healthy(self):
if CONF.tobiko.rhosp.has_external_load_balancer:
LOG.info("external load balancer used - "
"we can skip vips_resource sanity")
return True
else:
vips_resource_str = f"({self.ocf_prefix}heartbeat:IPaddr2):"
nodes_num = self.resource_count(vips_resource_str)
started_num = self.resource_count_in_state(
vips_resource_str, "Started")
if nodes_num == started_num and nodes_num > 0:
LOG.info("pcs status check: resources vips are "
"in healthy state")
return True
else:
LOG.info(
"pcs status check: resources"
" vips are not in healthy state")
return False
def ha_proxy_cinder_healthy(self):
if CONF.tobiko.rhosp.has_external_load_balancer:
LOG.info("external load balancer used "
"- we can skip ha_proxy_resource sanity")
return True
else:
ha_proxy_resource_str = (f"({self.ocf_prefix}heartbeat:"
f"{self.container_runtime()}):")
nodes_num = self.resource_count(ha_proxy_resource_str)
started_num = self.resource_count_in_state(
ha_proxy_resource_str, "Started")
if nodes_num == started_num and nodes_num > 0:
LOG.info("pcs status check: resources ha_proxy and"
" cinder are in healthy state")
return True
else:
LOG.info(
"pcs status check: resources ha_proxy and cinder "
"are not in healthy state")
return False
def ovn_resource_healthy(self):
ovn_resource_str = f"({self.ocf_prefix}ovn:ovndb-servers):"
if self.pcs_df.query(
f'resource_type == "{ovn_resource_str}"').empty:
LOG.info('pcs status check: ovn is not deployed, skipping ovn '
'resource check')
return True
nodes_num = self.resource_count(ovn_resource_str)
master_num = self.resource_count_in_state(
ovn_resource_str, self.promoted_status_str)
slave_num = self.resource_count_in_state(
ovn_resource_str, self.unpromoted_status_str)
if (master_num == 1) and (slave_num == nodes_num - master_num):
LOG.info(
"pcs status check: resource ovn is in healthy state")
return True
        else:
            LOG.info(
                "pcs status check: resource ovn is not in healthy state")
            return False
    @property
    def all_healthy(self):
        """
        Check that every resource is in a healthy state
        and return a global health status.

        :return: Bool
        """
for attempt_number in range(360):
try:
if all([
self.rabbitmq_resource_healthy(),
self.galera_resource_healthy(),
self.redis_resource_healthy(),
self.vips_resource_healthy(),
self.ha_proxy_cinder_healthy(),
self.ovn_resource_healthy()
]):
LOG.info("pcs status checks: all resources are"
" in healthy state")
return True
else:
LOG.info("pcs status check: not all resources are "
"in healthy "
"state")
raise PcsResourceException()
except PcsResourceException:
# reread pcs status
                LOG.info('Retrying pacemaker resource checks attempt '
                         '{} of 360'.format(attempt_number + 1))
time.sleep(1)
self.pcs_df = get_pcs_resources_table()
# exhausted all retries
tobiko.fail('pcs cluster is not in a healthy state')
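
# A minimal sketch of how this class is typically used by a health check
# (the helper name is hypothetical, not part of this module):
#
#     def assert_pacemaker_healthy():
#         assert PacemakerResourcesStatus().all_healthy
#
# all_healthy re-reads the pcs status table for up to 360 attempts before
# calling tobiko.fail(), so the assert only triggers once retries are
# exhausted.
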
def get_overcloud_nodes_running_pcs_resource(resource=None,
resource_type=None,
resource_state=None):
"""
    Check which nodes are running the specified resource/type/state.

    resource/resource_type/resource_state: exact strings as seen in pcs status
    :return: list of overcloud nodes
    """
# pylint: disable=no-member
pcs_df = get_pcs_resources_table()
if resource:
pcs_df_query_resource = pcs_df.query('resource=="{}"'.format(
resource))
return pcs_df_query_resource['overcloud_node'].unique().tolist()
if resource_type and resource_state:
pcs_df_query_resource_type_state = pcs_df.query(
'resource_type=="{}" and resource_state=="{}"'.format(
resource_type, resource_state))
return pcs_df_query_resource_type_state[
'overcloud_node'].unique().tolist()
if resource_type and not resource_state:
pcs_df_query_resource_type = pcs_df.query(
'resource_type=="{}"'.format(resource_type))
return pcs_df_query_resource_type['overcloud_node'].unique().tolist()
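
# Usage sketch, mirroring the nova-evacuate lookup done by
# instanceha_deployed() below:
#
#     nodes = get_overcloud_nodes_running_pcs_resource(
#         resource='nova-evacuate')
#     # e.g. ['controller-0'] (node name is hypothetical)
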
def get_resource_master_node(resource_type=None):
    return get_overcloud_nodes_running_pcs_resource(
        resource_type=resource_type, resource_state='Master')
def get_ovn_db_master_node():
ocf_prefix, promoted_status_str, _ = get_pcs_prefix_and_status_values()
return get_overcloud_nodes_running_pcs_resource(
resource_type=f'({ocf_prefix}ovn:ovndb-servers):',
resource_state=promoted_status_str)
def get_overcloud_resource(resource_type=None,
resource_state=None):
"""
    Check which resources match the specified type/state.

    resource_type/resource_state: exact strings as seen in pcs status
    :return: list of pcs resource names
    """
pcs_df = get_pcs_resources_table()
if resource_type and resource_state:
pcs_df_query_resource_type_state = pcs_df.query(
'resource_type=="{}" and resource_state=="{}"'.format(
resource_type, resource_state))
return pcs_df_query_resource_type_state[
'resource'].unique().tolist()
if resource_type and not resource_state:
# pylint: disable=no-member
pcs_df_query_resource_type = pcs_df.query(
'resource_type=="{}"'.format(resource_type))
return pcs_df_query_resource_type['resource'].unique().tolist()
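
# Usage sketch: list the OVN DB server resources currently promoted; the
# prefix/status strings depend on the OSP release (see
# get_pcs_prefix_and_status_values()):
#
#     ocf_prefix, promoted, _ = get_pcs_prefix_and_status_values()
#     ovn_resources = get_overcloud_resource(
#         resource_type=f'({ocf_prefix}ovn:ovndb-servers):',
#         resource_state=promoted)
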
def instanceha_deployed():
"""check IHA deployment
checks for existence of the nova-evacuate resource"""
if overcloud.has_overcloud():
return get_overcloud_nodes_running_pcs_resource(
resource='nova-evacuate')
else:
return False


skip_if_instanceha_not_delpoyed = tobiko.skip_unless(
    'instanceha not deployed', instanceha_deployed)
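
# Usage sketch: the returned decorator skips a test case unless instance HA
# is deployed (the test class below is hypothetical):
#
#     @skip_if_instanceha_not_delpoyed
#     class EvacuateInstancesTest(testtools.TestCase):
#         ...
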
def fencing_deployed():
"""check fencing deployment
checks for existence of the stonith-fence type resources"""
fencing_output = run_pcs_status(grep_str="stonith:fence_ipmilan")
if fencing_output:
return True
else:
return False


skip_if_fencing_not_deployed = tobiko.skip_unless(
    'fencing not deployed', fencing_deployed)
def run_pcs_status(ssh_client: ssh.SSHClientFixture = None,
options: list = None,
grep_str: str = None) -> str:
command_args = ['status']
command_args += options or []
output = execute_pcs(command_args,
ssh_client=ssh_client,
sudo=True)
if not grep_str:
return output
output_ocf_lines = []
for line in output.splitlines():
if grep_str in line:
output_ocf_lines.append(line)
return '\n'.join(output_ocf_lines)
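
# Usage sketch: fetch only the resource lines of `pcs status` from the first
# controller found by topology (the default when no ssh_client is given):
#
#     resources_output = run_pcs_status(options=['resources'], grep_str='ocf')
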
class PcsResourceOperation(enum.Enum):
DISABLE = "disable"
ENABLE = "enable"
RESTART = "restart"
SHOW = "show"
def __init__(self, pcsoperation: str):
self.pcsoperation = pcsoperation
DISABLE = PcsResourceOperation.DISABLE
ENABLE = PcsResourceOperation.ENABLE
RESTART = PcsResourceOperation.RESTART
SHOW = PcsResourceOperation.SHOW
def run_pcs_resource_operation(resource: str,
operation: PcsResourceOperation,
ssh_client: ssh.SSHClientFixture = None,
node: str = None,
operation_wait: int = 60,
retry_timeout: float = 180.,
retry_interval: float = 5.) -> str:
tobiko.check_valid_type(operation, PcsResourceOperation)
command_args = ['resource', operation.pcsoperation, resource]
if node is not None:
command_args.append(node)
command_args.append(f'--wait={operation_wait}')
# add stderr to the output if the operation is disable or enable
add_stderr = operation in (DISABLE, ENABLE)
# execute the command with retries
for attempt in tobiko.retry(timeout=retry_timeout,
interval=retry_interval):
try:
output = execute_pcs(command_args,
ssh_client=ssh_client,
add_stderr=add_stderr,
timeout=operation_wait + 10.,
sudo=True)
except sh.ShellCommandFailed as exc:
if attempt.is_last:
raise exc
else:
LOG.info('the pcs command failed - retrying...')
continue
break
return output
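
# Usage sketch: restart the OVN DBs bundle, waiting up to the default 60
# seconds for the pcs operation to complete:
#
#     run_pcs_resource_operation(OVN_DBS_RESOURCE, RESTART)
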
PCS_COMMAND = sh.shell_command(['pcs'])
def execute_pcs(command_args: list,
ssh_client: ssh.SSHClientFixture = None,
pcs_command: sh.ShellCommand = None,
add_stderr: bool = False,
timeout: float = 40.,
**execute_params) -> str:
if ssh_client is None:
ssh_client = topology.find_openstack_node(
group='controller').ssh_client
if pcs_command:
pcs_command = sh.shell_command(pcs_command)
else:
pcs_command = PCS_COMMAND
command = pcs_command + command_args
result = sh.execute(
command, ssh_client=ssh_client, stdin=False, stdout=True, stderr=True,
timeout=timeout, **execute_params)
if add_stderr:
output = '\n'.join([result.stdout, result.stderr])
else:
output = result.stdout
return output
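
# Usage sketch: equivalent of running `sudo pcs status resources` on a
# controller node picked by topology.find_openstack_node():
#
#     output = execute_pcs(['status', 'resources'], sudo=True)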