From 71633a13f9ba58e2be11d0fe6911b4971a0e8fee Mon Sep 17 00:00:00 2001 From: Brendan Shephard Date: Sat, 9 Oct 2021 05:12:46 +0000 Subject: [PATCH] Fix node scaling for Ephemeral Heat With Ephemeral Heat, we can no longer call the Heat API while removing nodes, nor is it necessary. This change removes all calls to Heat and instead just executes the required playbooks to perform the scale down actions. Conflicts: tripleoclient/tests/v1/overcloud_node/test_overcloud_node.py tripleoclient/workflows/scale.py Change-Id: Iba56d41d132275bd55e77290a6fca87b917de9e9 (cherry picked from commit 87dbe9f6d4e79127681d183fba45b682b6e31778) --- .../v1/overcloud_node/test_overcloud_node.py | 76 +++--- tripleoclient/v1/overcloud_node.py | 43 ++-- tripleoclient/workflows/scale.py | 230 ------------------ 3 files changed, 53 insertions(+), 296 deletions(-) delete mode 100644 tripleoclient/workflows/scale.py diff --git a/tripleoclient/tests/v1/overcloud_node/test_overcloud_node.py b/tripleoclient/tests/v1/overcloud_node/test_overcloud_node.py index 4da98ae2e..3dbad82c9 100644 --- a/tripleoclient/tests/v1/overcloud_node/test_overcloud_node.py +++ b/tripleoclient/tests/v1/overcloud_node/test_overcloud_node.py @@ -56,15 +56,12 @@ class TestDeleteNode(fakes.TestDeleteNode): self.addCleanup(wait_stack.stop) self.app.client_manager.compute.servers.get.return_value = None - @mock.patch('tripleoclient.workflows.scale.remove_node_from_stack', - autospec=True) @mock.patch('heatclient.common.event_utils.get_events', autospec=True) @mock.patch('tripleoclient.utils.run_ansible_playbook', autospec=True) def test_node_delete(self, mock_playbook, - mock_get_events, - mock_remove_stack): + mock_get_events): argslist = ['instance1', 'instance2', '--stack', 'overcast', '--timeout', '90', '--yes'] verifylist = [ @@ -106,15 +103,12 @@ class TestDeleteNode(fakes.TestDeleteNode): self.cmd.take_action, parsed_args) - @mock.patch('tripleoclient.workflows.scale.remove_node_from_stack', - autospec=True) @mock.patch('heatclient.common.event_utils.get_events', autospec=True) @mock.patch('tripleoclient.utils.run_ansible_playbook', autospec=True) def test_node_delete_without_stack(self, mock_playbook, - mock_get_events, - mock_remove_stack): + mock_get_events): arglist = ['instance1', '--yes'] verifylist = [ @@ -124,8 +118,8 @@ class TestDeleteNode(fakes.TestDeleteNode): parsed_args = self.check_parser(self.cmd, arglist, verifylist) self.cmd.take_action(parsed_args) - @mock.patch('tripleoclient.workflows.scale.remove_node_from_stack', - autospec=True) + @mock.patch('tripleoclient.utils.get_key') + @mock.patch('tripleoclient.utils.get_default_working_dir') @mock.patch('heatclient.common.event_utils.get_events', autospec=True) @mock.patch('tripleoclient.utils.run_ansible_playbook', @@ -135,7 +129,8 @@ class TestDeleteNode(fakes.TestDeleteNode): mock_tempfile, mock_playbook, mock_get_events, - mock_remove_from_stack): + mock_dir, + mock_key): bm_yaml = [{ 'name': 'Compute', @@ -164,6 +159,21 @@ class TestDeleteNode(fakes.TestDeleteNode): tempfile.mkdtemp() ] + mock_dir.return_value = "/home/stack/overcloud-deploy" + ansible_dir = "{}/config-download/overcast".format( + mock_dir.return_value + ) + + inventory = "{}/tripleo-ansible-inventory.yaml".format( + ansible_dir + ) + + ansible_cfg = "{}/ansible.cfg".format( + ansible_dir + ) + + mock_key.return_value = '/home/stack/.ssh/id_rsa_tripleo' + unprovision_confirm = os.path.join(tmp, 'unprovision_confirm.json') with open(unprovision_confirm, 'w') as confirm: confirm.write(json.dumps([ @@ -225,43 
+235,19 @@ class TestDeleteNode(fakes.TestDeleteNode): }, ), mock.call( - playbook='cli-grant-local-access.yaml', - inventory='localhost,', - workdir=mock.ANY, - playbook_dir='/usr/share/ansible/tripleo-playbooks', - verbosity=mock.ANY, - extra_vars={ - 'access_path': os.path.join(os.environ.get('HOME'), - 'config-download'), - 'execution_user': mock.ANY}, - ), - mock.call( - playbook='cli-config-download.yaml', - inventory='localhost,', - workdir=mock.ANY, - playbook_dir='/usr/share/ansible/tripleo-playbooks', - verbosity=mock.ANY, - extra_vars=mock.ANY, - reproduce_command=True, - ), - mock.call( - playbook=mock.ANY, - inventory=mock.ANY, - workdir=mock.ANY, - playbook_dir=mock.ANY, - skip_tags='opendev-validation', - ansible_cfg=None, - verbosity=mock.ANY, + playbook='scale_playbook.yaml', + inventory=inventory, + workdir=ansible_dir, + playbook_dir=ansible_dir, + ansible_cfg=ansible_cfg, ssh_user='tripleo-admin', - key=mock.ANY, limit_hosts='overcast-controller-1:overcast-compute-0', - ansible_timeout=42, reproduce_command=True, - extra_env_variables={'ANSIBLE_BECOME': True}, - extra_vars=None, - tags=None, - timeout=90, - forks=None + extra_env_variables={ + "ANSIBLE_BECOME": True, + "ANSIBLE_PRIVATE_KEY_FILE": + "/home/stack/.ssh/id_rsa_tripleo" + } ), mock.call( inventory='localhost,', diff --git a/tripleoclient/v1/overcloud_node.py b/tripleoclient/v1/overcloud_node.py index 4351b6994..59038d54a 100644 --- a/tripleoclient/v1/overcloud_node.py +++ b/tripleoclient/v1/overcloud_node.py @@ -31,10 +31,8 @@ import yaml from tripleoclient import command from tripleoclient import constants -from tripleoclient.exceptions import InvalidConfiguration from tripleoclient import utils as oooutils from tripleoclient.workflows import baremetal -from tripleoclient.workflows import scale class DeleteNode(command.Command): @@ -133,7 +131,6 @@ class DeleteNode(command.Command): def take_action(self, parsed_args): self.log.debug("take_action(%s)" % parsed_args) - clients = self.app.client_manager if parsed_args.baremetal_deployment: with open(parsed_args.baremetal_deployment, 'r') as fp: @@ -155,27 +152,31 @@ class DeleteNode(command.Command): if not confirm: raise oscexc.CommandError("Action not confirmed, exiting.") - orchestration_client = clients.orchestration + ansible_dir = os.path.join(oooutils.get_default_working_dir( + parsed_args.stack + ), + 'config-download', + parsed_args.stack) - stack = oooutils.get_stack(orchestration_client, parsed_args.stack) + inventory = os.path.join(ansible_dir, + 'tripleo-ansible-inventory.yaml') - if not stack: - raise InvalidConfiguration("stack {} not found".format( - parsed_args.stack)) + ansible_cfg = os.path.join(ansible_dir, 'ansible.cfg') + key_file = oooutils.get_key(parsed_args.stack) - print("Deleting the following nodes from stack {stack}:\n{nodes}" - .format(stack=stack.stack_name, nodes=nodes_text)) - - self._check_skiplist_exists(stack.environment()) - - scale.scale_down( - log=self.log, - clients=clients, - stack=stack, - nodes=nodes, - connection_timeout=parsed_args.overcloud_ssh_port_timeout, - timeout=parsed_args.timeout, - verbosity=oooutils.playbook_verbosity(self=self) + oooutils.run_ansible_playbook( + playbook='scale_playbook.yaml', + inventory=inventory, + workdir=ansible_dir, + playbook_dir=ansible_dir, + ansible_cfg=ansible_cfg, + ssh_user='tripleo-admin', + limit_hosts=':'.join('%s' % node for node in nodes), + reproduce_command=True, + extra_env_variables={ + "ANSIBLE_BECOME": True, + "ANSIBLE_PRIVATE_KEY_FILE": key_file + } ) if 
parsed_args.baremetal_deployment: diff --git a/tripleoclient/workflows/scale.py b/tripleoclient/workflows/scale.py deleted file mode 100644 index 76a3d3d96..000000000 --- a/tripleoclient/workflows/scale.py +++ /dev/null @@ -1,230 +0,0 @@ -# Copyright 2016 Red Hat, Inc. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import collections -import shutil -import tempfile - -from heatclient.common import event_utils - -from tripleoclient import constants -from tripleoclient import utils -from tripleoclient.workflows import deployment - - -def get_group_resources_after_delete(groupname, res_to_delete, resources): - group = next(res for res in resources if - res.resource_name == groupname and - res.resource_type == 'OS::Heat::ResourceGroup') - members = [] - for res in resources: - stack_name, stack_id = next( - x['href'] for x in res.links if - x['rel'] == 'stack').rsplit('/', 2)[1:] - # desired new count of nodes after delete operation should be - # count of all existing nodes in ResourceGroup which are not - # in set of nodes being deleted. Also nodes in any delete state - # from a previous failed update operation are not included in - # overall count (if such nodes exist) - if (stack_id == group.physical_resource_id and - res not in res_to_delete and - not res.resource_status.startswith('DELETE')): - - members.append(res) - - return members - - -def _get_removal_params_from_heat(resources_by_role, resources): - stack_params = {} - for role, role_resources in resources_by_role.items(): - param_name = "{0}Count".format(role) - - # get real count of nodes for each role. *Count stack parameters - # can not be used because stack parameters return parameters - # passed by user no matter if previous update operation succeeded - # or not - group_members = get_group_resources_after_delete( - role, role_resources, resources) - stack_params[param_name] = str(len(group_members)) - - # add instance resource names into removal_policies - # so heat knows which instances should be removed - removal_param = "{0}RemovalPolicies".format(role) - stack_params[removal_param] = [{ - 'resource_list': [r.resource_name for r in role_resources] - }] - - # force reset the removal_policies_mode to 'append' - # as 'update' can lead to deletion of unintended nodes. 
- removal_mode = "{0}RemovalPoliciesMode".format(role) - stack_params[removal_mode] = 'append' - - return stack_params - - -def _match_hostname(heatclient, instance_list, res, stack_name): - type_patterns = ['DeployedServer', 'Server'] - if any(res.resource_type.endswith(x) for x in type_patterns): - res_details = heatclient.resources.get( - stack_name, res.resource_name) - if 'name' in res_details.attributes: - try: - instance_list.remove(res_details.attributes['name']) - return True - except ValueError: - return False - return False - - -def remove_node_from_stack(clients, stack, nodes, timeout): - heat = clients.orchestration - resources = heat.resources.list(stack.stack_name, - nested_depth=5) - resources_by_role = collections.defaultdict(list) - instance_list = list(nodes) - - for res in resources: - stack_name, stack_id = next( - x['href'] for x in res.links if - x['rel'] == 'stack').rsplit('/', 2)[1:] - - try: - instance_list.remove(res.physical_resource_id) - except ValueError: - if not _match_hostname(heat, instance_list, - res, stack_name): - continue - - # get resource to remove from resource group (it's parent resource - # of nova server) - role_resource = next(x for x in resources if - x.physical_resource_id == stack_id) - # get the role name which is parent resource name in Heat - role = role_resource.parent_resource - resources_by_role[role].append(role_resource) - - resources_by_role = dict(resources_by_role) - - if instance_list: - raise ValueError( - "Couldn't find following instances in stack %s: %s" % - (stack, ','.join(instance_list))) - - # decrease count for each role (or resource group) and set removal - # policy for each resource group - stack_params = _get_removal_params_from_heat( - resources_by_role, resources) - try: - tht_tmp = tempfile.mkdtemp(prefix='tripleoclient-') - tht_root = "%s/tripleo-heat-templates" % tht_tmp - - created_env_files = [] - env_path = utils.create_breakpoint_cleanup_env( - tht_root, stack.stack_name) - created_env_files.extend(env_path) - param_env_path = utils.create_parameters_env( - stack_params, tht_root, stack.stack_name, - 'scale-down-parameters.yaml') - created_env_files.extend(param_env_path) - env_files_tracker = [] - env_files, _ = utils.process_multiple_environments( - created_env_files, tht_root, - constants.TRIPLEO_HEAT_TEMPLATES, - env_files_tracker=env_files_tracker) - - stack_args = { - 'stack_name': stack.stack_name, - 'environment_files': env_files_tracker, - 'files': env_files, - 'timeout_mins': timeout, - 'existing': True, - 'clear_parameters': list(stack_params.keys())} - - heat.stacks.update(stack.id, **stack_args) - finally: - shutil.rmtree(tht_tmp) - - -def scale_down(log, clients, stack, nodes, timeout=None, verbosity=0, - connection_timeout=None): - """Unprovision and deletes overcloud nodes from a heat stack. - - :param log: Logging object - :type log: Object - - :param clients: Application client object. - :type clients: Object - - :param stack: Heat Stack object - :type stack: Object - - :param nodes: List of nodes to delete. If the node UUID is used the - UUID will be used to lookup the node name before being - passed through to the cleanup playbook. - :type nodes: List - - :param timeout: Timeout to use when deleting nodes. If timeout is None - it will be set to 240 minutes. - :type timeout: Integer - - :param verbosity: Verbosity level - :type verbosity: Integer - - :param connection_timeout: Ansible connection timeout in seconds. 
- :type connection_timeout: Integer - """ - - if not timeout: - timeout = 240 - - limit_list = list() - for node in nodes: - try: - _node = clients.compute.servers.get(node) - limit_list.append(_node.name) - except Exception: - limit_list.append(node) - - if limit_list: - limit_list = ':'.join(limit_list) - else: - limit_list = None - - deployment.config_download( - log=log, - clients=clients, - stack=stack, - timeout=connection_timeout, - ansible_playbook_name='scale_playbook.yaml', - limit_hosts=limit_list, - verbosity=verbosity, - deployment_timeout=timeout - ) - - events = event_utils.get_events( - clients.orchestration, stack_id=stack.stack_name, - event_args={'sort_dir': 'desc', 'limit': 1}) - marker = events[0].id if events else None - - print('Running scale down') - - remove_node_from_stack(clients, stack, nodes, timeout) - - utils.wait_for_stack_ready( - orchestration_client=clients.orchestration, - stack_name=stack.stack_name, - action='UPDATE', - marker=marker)
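
For reviewers, a condensed sketch of the new Heat-free scale-down path this
patch introduces. The helpers (oooutils.get_default_working_dir,
oooutils.get_key, oooutils.run_ansible_playbook) and the playbook arguments
are taken from the diff above; the stack and node names are illustrative
placeholders, not values the patch assumes.

    import os

    from tripleoclient import utils as oooutils

    # Illustrative values only; in take_action these come from parsed_args.
    stack = 'overcast'
    nodes = ['overcast-controller-1', 'overcast-compute-0']

    # config-download directory laid out by the deployment, e.g.
    # /home/stack/overcloud-deploy/config-download/overcast
    ansible_dir = os.path.join(
        oooutils.get_default_working_dir(stack),
        'config-download', stack)

    # Run the scale-down playbook directly against the existing
    # config-download inventory; no Heat stack lookup or stack update
    # is involved any more.
    oooutils.run_ansible_playbook(
        playbook='scale_playbook.yaml',
        inventory=os.path.join(ansible_dir,
                               'tripleo-ansible-inventory.yaml'),
        workdir=ansible_dir,
        playbook_dir=ansible_dir,
        ansible_cfg=os.path.join(ansible_dir, 'ansible.cfg'),
        ssh_user='tripleo-admin',
        limit_hosts=':'.join(nodes),
        reproduce_command=True,
        extra_env_variables={
            'ANSIBLE_BECOME': True,
            'ANSIBLE_PRIVATE_KEY_FILE': oooutils.get_key(stack),
        })

From the operator side this is driven by the same CLI as before, per the
updated tests, e.g.:

    openstack overcloud node delete --stack overcast --timeout 90 --yes \
        instance1 instance2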