from __future__ import absolute_import import enum import io import time import typing from oslo_log import log import pandas import tobiko from tobiko import config from tobiko.tripleo import overcloud from tobiko.shell import sh from tobiko.shell import ssh from tobiko.openstack import topology CONF = config.CONF LOG = log.getLogger(__name__) GALERA_RESOURCE = "galera-bundle" HAPROXY_RESOURCE = "haproxy-bundle" OVN_DBS_RESOURCE = "ovn-dbs-bundle" class PcsResourceException(tobiko.TobikoException): message = "pcs cluster is not in a healthy state" def get_pcs_resources_table(timeout=720, interval=2) -> pandas.DataFrame: """ get pcs status from a controller and parse it to have it's resources states in check returns : rabbitmq-bundle-0 (ocf::heartbeat:rabbitmq-cluster): Started con troller-0 ip-10.0.0.101 (ocf::heartbeat:IPaddr2): Started controller-1 openstack-cinder-volume-docker-0 (ocf::heartbeat:docker): Sta rted controller-0 :return: dataframe of pcs resources stats table """ failures: typing.List[str] = [] start = time.time() # prevent pcs table read failure while pacemaker is starting while time.time() - start < timeout: failures = [] try: output = run_pcs_status(options=['resources'], grep_str='ocf') # remove the first column when it only includes '*' characters output = output.replace('*', '').strip() stream = io.StringIO(output) table: pandas.DataFrame = pandas.read_csv( stream, delim_whitespace=True, header=None) table.columns = ['resource', 'resource_type', 'resource_state', 'overcloud_node'] except ValueError: pcs_status_raw = run_pcs_status() failures.append(f'pcs status table import failed : ' f'pcs status stdout:\n {pcs_status_raw}') LOG.info('Retrying , timeout at: {}' .format(timeout-(time.time() - start))) time.sleep(interval) else: break # exhausted all retries if failures: tobiko.fail('pcs status table import error\n' + '\n'.join(failures)) LOG.debug("Got pcs status :\n%s", table) return table def get_pcs_prefix_and_status_values(): if topology.verify_osp_version('17.0', lower=True): ocf_prefix = "ocf::" promoted_status_str = "Master" unpromoted_status_str = "Slave" else: ocf_prefix = "ocf:" promoted_status_str = "Promoted" unpromoted_status_str = "Unpromoted" return ocf_prefix, promoted_status_str, unpromoted_status_str class PacemakerResourcesStatus(object): """ class to handle pcs resources checks """ def __init__(self): self.pcs_df = get_pcs_resources_table() (self.ocf_prefix, self.promoted_status_str, self.unpromoted_status_str) = get_pcs_prefix_and_status_values() def container_runtime(self): if not self.pcs_df[(self.pcs_df['resource_type'] == f"({self.ocf_prefix}heartbeat:docker):")].empty: return 'docker' if not self.pcs_df[(self.pcs_df['resource_type'] == f"({self.ocf_prefix}heartbeat:podman):")].empty: return 'podman' def resource_count(self, resource_type): return self.pcs_df[(self.pcs_df['resource_type'] == resource_type)][ 'resource_state'].count() def resource_count_in_state(self, resource_type, resource_state): return self.pcs_df[(self.pcs_df['resource_type'] == resource_type) & (self.pcs_df['resource_state'] == resource_state)][ 'resource_state'].count() def rabbitmq_resource_healthy(self): rabbitmq_resource_str = \ f"({self.ocf_prefix}heartbeat:rabbitmq-cluster):" nodes_num = self.resource_count(rabbitmq_resource_str) started_num = self.resource_count_in_state( rabbitmq_resource_str, "Started") if nodes_num == started_num and nodes_num > 0: LOG.info("pcs status check: resource rabbitmq is in healthy state") return True else: LOG.info("pcs status check: resource rabbitmq not in healthy " "state") return False def galera_resource_healthy(self): galera_resource_str = f"({self.ocf_prefix}heartbeat:galera):" nodes_num = self.resource_count(galera_resource_str) master_num = self.resource_count_in_state( galera_resource_str, self.promoted_status_str) if nodes_num == master_num and nodes_num > 0: LOG.info("pcs status check: resource galera is in healthy state") return True else: LOG.info("pcs status check: resource galera not in healthy state") return False def redis_resource_healthy(self): redis_resource_str = f"({self.ocf_prefix}heartbeat:redis):" if not overcloud.is_redis_expected(): LOG.info("redis resource not expected on OSP 17 " "and later releases by default") return self.pcs_df.query( f'resource_type == "{redis_resource_str}"').empty nodes_num = self.resource_count(redis_resource_str) master_num = self.resource_count_in_state( redis_resource_str, self.promoted_status_str) slave_num = self.resource_count_in_state( redis_resource_str, self.unpromoted_status_str) if (master_num == 1) and (slave_num == nodes_num - master_num): LOG.info("pcs status check: resource redis is in healthy state") return True else: LOG.info("pcs status check: resource redis not in healthy state") return False def vips_resource_healthy(self): if CONF.tobiko.rhosp.has_external_load_balancer: LOG.info("external load balancer used - " "we can skip vips_resource sanity") return True else: vips_resource_str = f"({self.ocf_prefix}heartbeat:IPaddr2):" nodes_num = self.resource_count(vips_resource_str) started_num = self.resource_count_in_state( vips_resource_str, "Started") if nodes_num == started_num and nodes_num > 0: LOG.info("pcs status check: resources vips are " "in healthy state") return True else: LOG.info( "pcs status check: resources" " vips are not in healthy state") return False def ha_proxy_cinder_healthy(self): if CONF.tobiko.rhosp.has_external_load_balancer: LOG.info("external load balancer used " "- we can skip ha_proxy_resource sanity") return True else: ha_proxy_resource_str = (f"({self.ocf_prefix}heartbeat:" f"{self.container_runtime()}):") nodes_num = self.resource_count(ha_proxy_resource_str) started_num = self.resource_count_in_state( ha_proxy_resource_str, "Started") if nodes_num == started_num and nodes_num > 0: LOG.info("pcs status check: resources ha_proxy and" " cinder are in healthy state") return True else: LOG.info( "pcs status check: resources ha_proxy and cinder " "are not in healthy state") return False def ovn_resource_healthy(self): ovn_resource_str = f"({self.ocf_prefix}ovn:ovndb-servers):" if self.pcs_df.query( f'resource_type == "{ovn_resource_str}"').empty: LOG.info('pcs status check: ovn is not deployed, skipping ovn ' 'resource check') return True nodes_num = self.resource_count(ovn_resource_str) master_num = self.resource_count_in_state( ovn_resource_str, self.promoted_status_str) slave_num = self.resource_count_in_state( ovn_resource_str, self.unpromoted_status_str) if (master_num == 1) and (slave_num == nodes_num - master_num): LOG.info( "pcs status check: resource ovn is in healthy state") return True else: LOG.info( "pcs status check: resource ovn is in not in " "healthy state") return False @property def all_healthy(self): """ check if each resource is in healthy order and return a global healthy status :return: Bool """ for attempt_number in range(360): try: if all([ self.rabbitmq_resource_healthy(), self.galera_resource_healthy(), self.redis_resource_healthy(), self.vips_resource_healthy(), self.ha_proxy_cinder_healthy(), self.ovn_resource_healthy() ]): LOG.info("pcs status checks: all resources are" " in healthy state") return True else: LOG.info("pcs status check: not all resources are " "in healthy " "state") raise PcsResourceException() except PcsResourceException: # reread pcs status LOG.info('Retrying pacemaker resource checks attempt ' '{} of 360'.format(attempt_number)) time.sleep(1) self.pcs_df = get_pcs_resources_table() # exhausted all retries tobiko.fail('pcs cluster is not in a healthy state') def get_overcloud_nodes_running_pcs_resource(resource=None, resource_type=None, resource_state=None): """ Check what nodes are running the specified resource/type/state resource/type/state: exact str of a resource name as seen in pcs status :return: list of overcloud nodes """ # pylint: disable=no-member pcs_df = get_pcs_resources_table() if resource: pcs_df_query_resource = pcs_df.query('resource=="{}"'.format( resource)) return pcs_df_query_resource['overcloud_node'].unique().tolist() if resource_type and resource_state: pcs_df_query_resource_type_state = pcs_df.query( 'resource_type=="{}" and resource_state=="{}"'.format( resource_type, resource_state)) return pcs_df_query_resource_type_state[ 'overcloud_node'].unique().tolist() if resource_type and not resource_state: pcs_df_query_resource_type = pcs_df.query( 'resource_type=="{}"'.format(resource_type)) return pcs_df_query_resource_type['overcloud_node'].unique().tolist() def get_resource_master_node(resource_type=None): get_overcloud_nodes_running_pcs_resource( resource_type=resource_type, resource_state='Master') def get_ovn_db_master_node(): ocf_prefix, promoted_status_str, _ = get_pcs_prefix_and_status_values() return get_overcloud_nodes_running_pcs_resource( resource_type=f'({ocf_prefix}ovn:ovndb-servers):', resource_state=promoted_status_str) def get_overcloud_resource(resource_type=None, resource_state=None): """ Check what nodes are running the specified resource/type/state resource/type/state: exact str of a resource name as seen in pcs status :return: list of overcloud nodes """ pcs_df = get_pcs_resources_table() if resource_type and resource_state: pcs_df_query_resource_type_state = pcs_df.query( 'resource_type=="{}" and resource_state=="{}"'.format( resource_type, resource_state)) return pcs_df_query_resource_type_state[ 'resource'].unique().tolist() if resource_type and not resource_state: # pylint: disable=no-member pcs_df_query_resource_type = pcs_df.query( 'resource_type=="{}"'.format(resource_type)) return pcs_df_query_resource_type['resource'].unique().tolist() def instanceha_deployed(): """check IHA deployment checks for existence of the nova-evacuate resource""" if overcloud.has_overcloud(): return get_overcloud_nodes_running_pcs_resource( resource='nova-evacuate') else: return False skip_if_instanceha_not_delpoyed = tobiko.skip_unless( 'instanceha not delpoyed', instanceha_deployed) def fencing_deployed(): """check fencing deployment checks for existence of the stonith-fence type resources""" fencing_output = run_pcs_status(grep_str="stonith:fence_ipmilan") if fencing_output: return True else: return False skip_if_fencing_not_deployed = tobiko.skip_unless( 'fencing not delpoyed', fencing_deployed) def run_pcs_status(ssh_client: ssh.SSHClientFixture = None, options: list = None, grep_str: str = None) -> str: command_args = ['status'] command_args += options or [] output = execute_pcs(command_args, ssh_client=ssh_client, sudo=True) if not grep_str: return output output_ocf_lines = [] for line in output.splitlines(): if grep_str in line: output_ocf_lines.append(line) return '\n'.join(output_ocf_lines) class PcsResourceOperation(enum.Enum): DISABLE = "disable" ENABLE = "enable" RESTART = "restart" SHOW = "show" def __init__(self, pcsoperation: str): self.pcsoperation = pcsoperation DISABLE = PcsResourceOperation.DISABLE ENABLE = PcsResourceOperation.ENABLE RESTART = PcsResourceOperation.RESTART SHOW = PcsResourceOperation.SHOW def run_pcs_resource_operation(resource: str, operation: PcsResourceOperation, ssh_client: ssh.SSHClientFixture = None, node: str = None, operation_wait: int = 60, retry_timeout: float = 180., retry_interval: float = 5.) -> str: tobiko.check_valid_type(operation, PcsResourceOperation) command_args = ['resource', operation.pcsoperation, resource] if node is not None: command_args.append(node) command_args.append(f'--wait={operation_wait}') # add stderr to the output if the operation is disable or enable add_stderr = operation in (DISABLE, ENABLE) # execute the command with retries for attempt in tobiko.retry(timeout=retry_timeout, interval=retry_interval): try: output = execute_pcs(command_args, ssh_client=ssh_client, add_stderr=add_stderr, timeout=operation_wait + 10., sudo=True) except sh.ShellCommandFailed as exc: if attempt.is_last: raise exc else: LOG.info('the pcs command failed - retrying...') continue break return output PCS_COMMAND = sh.shell_command(['pcs']) def execute_pcs(command_args: list, ssh_client: ssh.SSHClientFixture = None, pcs_command: sh.ShellCommand = None, add_stderr: bool = False, timeout: float = 40., **execute_params) -> str: if ssh_client is None: ssh_client = topology.find_openstack_node( group='controller').ssh_client if pcs_command: pcs_command = sh.shell_command(pcs_command) else: pcs_command = PCS_COMMAND command = pcs_command + command_args result = sh.execute( command, ssh_client=ssh_client, stdin=False, stdout=True, stderr=True, timeout=timeout, **execute_params) if add_stderr: output = '\n'.join([result.stdout, result.stderr]) else: output = result.stdout return output