From 050c9aa99fc6a3818ae37fe6382318be9f59706d Mon Sep 17 00:00:00 2001 From: David Vallee Delisle Date: Thu, 13 May 2021 03:47:47 +0000 Subject: [PATCH] [train-only] post stack creation tsx validation RHEL-8.3 kernel disabled the Intel TSX (Transactional Synchronization Extensions) feature by default as a preemptive security measure, but it breaks live migration from RHEL-7.9 (or even RHEL-8.1 or RHEL-8.2) to RHEL-8.3. Operators are expected to explicitly define the TSX flag in their KernelArgs for the compute role to prevent live-migration issues during the upgrade process. This is explained in detail in this article [a]. If operators don't want to add the TSX flag to the KernelArgs, they can always set "ForceNoTsx" to true. Adding this mandatory validation right after the stacks are updated is probably the earliest place where we can validate and fail if necessary. We'd rather fail quickly than too late as this will provide the best experience for our users. In addition to this, there's a tripleo-validation [b] in the works. This is meant to be train-only for now but we will have to refactor if (when?) 
we support FFU from queens to Wallaby+ [a] https://access.redhat.com/solutions/6036141 [b] https://review.opendev.org/c/openstack/tripleo-validations/+/790806 Co-Authored-By: Martin Schuppert Related: https://bugzilla.redhat.com/1923165 Closes-Bug: #1916758 Change-Id: I35246fbf74394f6e315973283464085d2aef08b2 --- .../tsx-validation-f663a9f14b2aff3d.yaml | 17 +++++++++ tripleoclient/exceptions.py | 4 +++ tripleoclient/v1/overcloud_deploy.py | 36 +++++++++++++++++++ tripleoclient/v1/overcloud_update.py | 8 +++++ tripleoclient/v1/overcloud_upgrade.py | 7 ++++ 5 files changed, 72 insertions(+) create mode 100644 releasenotes/notes/tsx-validation-f663a9f14b2aff3d.yaml diff --git a/releasenotes/notes/tsx-validation-f663a9f14b2aff3d.yaml b/releasenotes/notes/tsx-validation-f663a9f14b2aff3d.yaml new file mode 100644 index 000000000..6aea23bff --- /dev/null +++ b/releasenotes/notes/tsx-validation-f663a9f14b2aff3d.yaml @@ -0,0 +1,17 @@ +--- +fixes: + - | + RHEL-8.3 kernel disabled the Intel TSX (Transactional + Synchronization Extensions) feature by default as a preemptive + security measure, but it breaks live migration from RHEL-7.9 + (or even RHEL-8.1 or RHEL-8.2) to RHEL-8.3. + + Operators are expected to explicitly define the TSX flag in + their KernelArgs for the compute role to prevent live-migration + issues during the upgrade or update process. + + We now introduce this validation in tripleoclient to ensure + early failure. 
+ + More information here: + https://access.redhat.com/solutions/6036141 diff --git a/tripleoclient/exceptions.py b/tripleoclient/exceptions.py index 55c8589a9..ee0cc8c58 100644 --- a/tripleoclient/exceptions.py +++ b/tripleoclient/exceptions.py @@ -147,3 +147,7 @@ class CellExportError(Base): class BannedParameters(Base): """Some of the environment parameters provided should be removed""" + + +class PostStackValidationError(Base): + """Stack validation failed""" diff --git a/tripleoclient/v1/overcloud_deploy.py b/tripleoclient/v1/overcloud_deploy.py index 3a31a103d..eb375817a 100644 --- a/tripleoclient/v1/overcloud_deploy.py +++ b/tripleoclient/v1/overcloud_deploy.py @@ -712,6 +712,42 @@ class DeployOvercloud(command.Command): roles=roles ) + def _post_stack_validation(self, stack): + """Post stack update mandatory validation + + Runs a validation to make sure that KernelArgs either + contains a TSX parameter or the ForceNoTsx parameter is defined. + This is a mandatory validation and it has to happen + as soon as possible. + """ + + libvirt_service = "OS::TripleO::Services::NovaLibvirt" + services = filter(lambda x: (x.endswith('Services') and + libvirt_service in stack.parameters[x]), + stack.parameters) + impacted_roles = [] + for i in services: + role_name = re.sub('Services$', '', i) + role_param = stack.parameters.get(role_name + 'Parameters') + if role_param: + role_params = json.loads(role_param) + kernel_args = role_params.get('KernelArgs') + no_tsx = role_params.get('ForceNoTsx') + if (not no_tsx and + (not kernel_args or "tsx=" not in kernel_args)): + impacted_roles.append(role_name) + if len(impacted_roles): + self.log.error("Roles in the following list are expected to have " + "a TSX flag configured in their KernelArgs " + "parameter. 
For more information on why we must " + "explicitly define the TSX flag, please visit: " + "https://access.redhat.com/solutions/6036141") + self.log.error("You can also skip this validation by setting " + "ForceNoTsx parameter for the desired role(s)") + self.log.error("Impacted roles: {roles}".format( + roles=",".join(impacted_roles))) + raise exceptions.PostStackValidationError() + def get_parser(self, prog_name): # add_help doesn't work properly, set it to False: parser = argparse.ArgumentParser( diff --git a/tripleoclient/v1/overcloud_update.py b/tripleoclient/v1/overcloud_update.py index 25ccf8fa1..531d6c0ef 100644 --- a/tripleoclient/v1/overcloud_update.py +++ b/tripleoclient/v1/overcloud_update.py @@ -86,6 +86,14 @@ class UpdatePrepare(DeployOvercloud): super(UpdatePrepare, self).take_action(parsed_args) package_update.update(clients, container=stack_name) + + # "Mandatory" validation to make sure kernelargs contains + # a TSX flag + if not parsed_args.disable_validations: + stack = oooutils.get_stack(clients.orchestration, + parsed_args.stack) + self._post_stack_validation(stack) + package_update.get_config(clients, container=stack_name) self.log.info("Update init on stack {0} complete.".format( parsed_args.stack)) diff --git a/tripleoclient/v1/overcloud_upgrade.py b/tripleoclient/v1/overcloud_upgrade.py index 53a57d2f3..103f22ea5 100644 --- a/tripleoclient/v1/overcloud_upgrade.py +++ b/tripleoclient/v1/overcloud_upgrade.py @@ -102,6 +102,13 @@ class UpgradePrepare(DeployOvercloud): # DeployOvercloud. package_update.get_config(clients, container=stack_name) + # "Mandatory" validation to make sure kernelargs contains + # a TSX flag + if not parsed_args.disable_validations: + stack = oooutils.get_stack(clients.orchestration, + parsed_args.stack) + self._post_stack_validation(stack) + # enable ssh admin for Ansible-via-Mistral as that's done only # when config_download is true deployment.get_hosts_and_enable_ssh_admin(