Fix node scaling for Ephemeral Heat

With Ephemeral Heat, we can no longer call
the Heat API while removing nodes, nor is it
necessary. This change removes all calls to
Heat and instead just executes the required
playbooks to perform the scale-down actions.

Conflicts:
 tripleoclient/tests/v1/overcloud_node/test_overcloud_node.py
 tripleoclient/workflows/scale.py

Change-Id: Iba56d41d132275bd55e77290a6fca87b917de9e9
(cherry picked from commit 87dbe9f6d4)
Author: Brendan Shephard
Date: 2021-10-09 05:12:46 +00:00
Parent: d0a9648d81
Commit: 71633a13f9
3 changed files with 53 additions and 296 deletions
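In short, node delete now resolves the stack's config-download directory and runs the scale playbook directly instead of updating the Heat stack. A condensed sketch of that flow follows; helper names, parameters, and defaults mirror the take_action() changes further down in this diff, so treat it as illustrative rather than the exact implementation.

# Condensed sketch of the Heat-free scale-down path (mirrors the
# take_action() changes below; uses only helpers that appear in this diff).
import os

from tripleoclient import utils as oooutils


def scale_down_via_playbook(stack_name, nodes):
    # config-download artifacts live under the per-stack working directory,
    # e.g. /home/stack/overcloud-deploy/<stack>/config-download/<stack>
    ansible_dir = os.path.join(
        oooutils.get_default_working_dir(stack_name),
        'config-download', stack_name)
    oooutils.run_ansible_playbook(
        playbook='scale_playbook.yaml',
        inventory=os.path.join(ansible_dir, 'tripleo-ansible-inventory.yaml'),
        workdir=ansible_dir,
        playbook_dir=ansible_dir,
        ansible_cfg=os.path.join(ansible_dir, 'ansible.cfg'),
        ssh_user='tripleo-admin',
        limit_hosts=':'.join('%s' % node for node in nodes),
        reproduce_command=True,
        extra_env_variables={
            'ANSIBLE_BECOME': True,
            # per-stack SSH key resolved by tripleoclient.utils.get_key()
            'ANSIBLE_PRIVATE_KEY_FILE': oooutils.get_key(stack_name),
        },
    )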

tripleoclient/tests/v1/overcloud_node/test_overcloud_node.py

@@ -56,15 +56,12 @@ class TestDeleteNode(fakes.TestDeleteNode):
self.addCleanup(wait_stack.stop)
self.app.client_manager.compute.servers.get.return_value = None
@mock.patch('tripleoclient.workflows.scale.remove_node_from_stack',
autospec=True)
@mock.patch('heatclient.common.event_utils.get_events',
autospec=True)
@mock.patch('tripleoclient.utils.run_ansible_playbook',
autospec=True)
def test_node_delete(self, mock_playbook,
mock_get_events,
mock_remove_stack):
mock_get_events):
argslist = ['instance1', 'instance2', '--stack', 'overcast',
'--timeout', '90', '--yes']
verifylist = [
@@ -106,15 +103,12 @@ class TestDeleteNode(fakes.TestDeleteNode):
self.cmd.take_action,
parsed_args)
@mock.patch('tripleoclient.workflows.scale.remove_node_from_stack',
autospec=True)
@mock.patch('heatclient.common.event_utils.get_events',
autospec=True)
@mock.patch('tripleoclient.utils.run_ansible_playbook',
autospec=True)
def test_node_delete_without_stack(self, mock_playbook,
mock_get_events,
mock_remove_stack):
mock_get_events):
arglist = ['instance1', '--yes']
verifylist = [
@@ -124,8 +118,8 @@ class TestDeleteNode(fakes.TestDeleteNode):
parsed_args = self.check_parser(self.cmd, arglist, verifylist)
self.cmd.take_action(parsed_args)
@mock.patch('tripleoclient.workflows.scale.remove_node_from_stack',
autospec=True)
@mock.patch('tripleoclient.utils.get_key')
@mock.patch('tripleoclient.utils.get_default_working_dir')
@mock.patch('heatclient.common.event_utils.get_events',
autospec=True)
@mock.patch('tripleoclient.utils.run_ansible_playbook',
@@ -135,7 +129,8 @@ class TestDeleteNode(fakes.TestDeleteNode):
mock_tempfile,
mock_playbook,
mock_get_events,
mock_remove_from_stack):
mock_dir,
mock_key):
bm_yaml = [{
'name': 'Compute',
@@ -164,6 +159,21 @@ class TestDeleteNode(fakes.TestDeleteNode):
tempfile.mkdtemp()
]
mock_dir.return_value = "/home/stack/overcloud-deploy"
ansible_dir = "{}/config-download/overcast".format(
mock_dir.return_value
)
inventory = "{}/tripleo-ansible-inventory.yaml".format(
ansible_dir
)
ansible_cfg = "{}/ansible.cfg".format(
ansible_dir
)
mock_key.return_value = '/home/stack/.ssh/id_rsa_tripleo'
unprovision_confirm = os.path.join(tmp, 'unprovision_confirm.json')
with open(unprovision_confirm, 'w') as confirm:
confirm.write(json.dumps([
@@ -225,43 +235,19 @@ class TestDeleteNode(fakes.TestDeleteNode):
},
),
mock.call(
playbook='cli-grant-local-access.yaml',
inventory='localhost,',
workdir=mock.ANY,
playbook_dir='/usr/share/ansible/tripleo-playbooks',
verbosity=mock.ANY,
extra_vars={
'access_path': os.path.join(os.environ.get('HOME'),
'config-download'),
'execution_user': mock.ANY},
),
mock.call(
playbook='cli-config-download.yaml',
inventory='localhost,',
workdir=mock.ANY,
playbook_dir='/usr/share/ansible/tripleo-playbooks',
verbosity=mock.ANY,
extra_vars=mock.ANY,
reproduce_command=True,
),
mock.call(
playbook=mock.ANY,
inventory=mock.ANY,
workdir=mock.ANY,
playbook_dir=mock.ANY,
skip_tags='opendev-validation',
ansible_cfg=None,
verbosity=mock.ANY,
playbook='scale_playbook.yaml',
inventory=inventory,
workdir=ansible_dir,
playbook_dir=ansible_dir,
ansible_cfg=ansible_cfg,
ssh_user='tripleo-admin',
key=mock.ANY,
limit_hosts='overcast-controller-1:overcast-compute-0',
ansible_timeout=42,
reproduce_command=True,
extra_env_variables={'ANSIBLE_BECOME': True},
extra_vars=None,
tags=None,
timeout=90,
forks=None
extra_env_variables={
"ANSIBLE_BECOME": True,
"ANSIBLE_PRIVATE_KEY_FILE":
"/home/stack/.ssh/id_rsa_tripleo"
}
),
mock.call(
inventory='localhost,',

tripleoclient/v1/overcloud_node.py

@@ -31,10 +31,8 @@ import yaml
from tripleoclient import command
from tripleoclient import constants
from tripleoclient.exceptions import InvalidConfiguration
from tripleoclient import utils as oooutils
from tripleoclient.workflows import baremetal
from tripleoclient.workflows import scale
class DeleteNode(command.Command):
@@ -133,7 +131,6 @@ class DeleteNode(command.Command):
def take_action(self, parsed_args):
self.log.debug("take_action(%s)" % parsed_args)
clients = self.app.client_manager
if parsed_args.baremetal_deployment:
with open(parsed_args.baremetal_deployment, 'r') as fp:
@@ -155,27 +152,31 @@ class DeleteNode(command.Command):
if not confirm:
raise oscexc.CommandError("Action not confirmed, exiting.")
orchestration_client = clients.orchestration
ansible_dir = os.path.join(oooutils.get_default_working_dir(
parsed_args.stack
),
'config-download',
parsed_args.stack)
stack = oooutils.get_stack(orchestration_client, parsed_args.stack)
inventory = os.path.join(ansible_dir,
'tripleo-ansible-inventory.yaml')
if not stack:
raise InvalidConfiguration("stack {} not found".format(
parsed_args.stack))
ansible_cfg = os.path.join(ansible_dir, 'ansible.cfg')
key_file = oooutils.get_key(parsed_args.stack)
print("Deleting the following nodes from stack {stack}:\n{nodes}"
.format(stack=stack.stack_name, nodes=nodes_text))
self._check_skiplist_exists(stack.environment())
scale.scale_down(
log=self.log,
clients=clients,
stack=stack,
nodes=nodes,
connection_timeout=parsed_args.overcloud_ssh_port_timeout,
timeout=parsed_args.timeout,
verbosity=oooutils.playbook_verbosity(self=self)
oooutils.run_ansible_playbook(
playbook='scale_playbook.yaml',
inventory=inventory,
workdir=ansible_dir,
playbook_dir=ansible_dir,
ansible_cfg=ansible_cfg,
ssh_user='tripleo-admin',
limit_hosts=':'.join('%s' % node for node in nodes),
reproduce_command=True,
extra_env_variables={
"ANSIBLE_BECOME": True,
"ANSIBLE_PRIVATE_KEY_FILE": key_file
}
)
if parsed_args.baremetal_deployment:

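Note that the nodes passed on the command line are collapsed into a single colon-separated Ansible limit pattern, so the scale playbook only runs against the hosts being removed. For example, with the host names asserted in the test above:

nodes = ['overcast-controller-1', 'overcast-compute-0']
limit_hosts = ':'.join('%s' % node for node in nodes)
# -> 'overcast-controller-1:overcast-compute-0'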
tripleoclient/workflows/scale.py

@@ -1,230 +0,0 @@
# Copyright 2016 Red Hat, Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import collections
import shutil
import tempfile
from heatclient.common import event_utils
from tripleoclient import constants
from tripleoclient import utils
from tripleoclient.workflows import deployment
def get_group_resources_after_delete(groupname, res_to_delete, resources):
group = next(res for res in resources if
res.resource_name == groupname and
res.resource_type == 'OS::Heat::ResourceGroup')
members = []
for res in resources:
stack_name, stack_id = next(
x['href'] for x in res.links if
x['rel'] == 'stack').rsplit('/', 2)[1:]
# desired new count of nodes after delete operation should be
# count of all existing nodes in ResourceGroup which are not
# in set of nodes being deleted. Also nodes in any delete state
# from a previous failed update operation are not included in
# overall count (if such nodes exist)
if (stack_id == group.physical_resource_id and
res not in res_to_delete and
not res.resource_status.startswith('DELETE')):
members.append(res)
return members
def _get_removal_params_from_heat(resources_by_role, resources):
stack_params = {}
for role, role_resources in resources_by_role.items():
param_name = "{0}Count".format(role)
# get real count of nodes for each role. *Count stack parameters
# can not be used because stack parameters return parameters
# passed by user no matter if previous update operation succeeded
# or not
group_members = get_group_resources_after_delete(
role, role_resources, resources)
stack_params[param_name] = str(len(group_members))
# add instance resource names into removal_policies
# so heat knows which instances should be removed
removal_param = "{0}RemovalPolicies".format(role)
stack_params[removal_param] = [{
'resource_list': [r.resource_name for r in role_resources]
}]
# force reset the removal_policies_mode to 'append'
# as 'update' can lead to deletion of unintended nodes.
removal_mode = "{0}RemovalPoliciesMode".format(role)
stack_params[removal_mode] = 'append'
return stack_params
def _match_hostname(heatclient, instance_list, res, stack_name):
type_patterns = ['DeployedServer', 'Server']
if any(res.resource_type.endswith(x) for x in type_patterns):
res_details = heatclient.resources.get(
stack_name, res.resource_name)
if 'name' in res_details.attributes:
try:
instance_list.remove(res_details.attributes['name'])
return True
except ValueError:
return False
return False
def remove_node_from_stack(clients, stack, nodes, timeout):
heat = clients.orchestration
resources = heat.resources.list(stack.stack_name,
nested_depth=5)
resources_by_role = collections.defaultdict(list)
instance_list = list(nodes)
for res in resources:
stack_name, stack_id = next(
x['href'] for x in res.links if
x['rel'] == 'stack').rsplit('/', 2)[1:]
try:
instance_list.remove(res.physical_resource_id)
except ValueError:
if not _match_hostname(heat, instance_list,
res, stack_name):
continue
# get resource to remove from resource group (it's parent resource
# of nova server)
role_resource = next(x for x in resources if
x.physical_resource_id == stack_id)
# get the role name which is parent resource name in Heat
role = role_resource.parent_resource
resources_by_role[role].append(role_resource)
resources_by_role = dict(resources_by_role)
if instance_list:
raise ValueError(
"Couldn't find following instances in stack %s: %s" %
(stack, ','.join(instance_list)))
# decrease count for each role (or resource group) and set removal
# policy for each resource group
stack_params = _get_removal_params_from_heat(
resources_by_role, resources)
try:
tht_tmp = tempfile.mkdtemp(prefix='tripleoclient-')
tht_root = "%s/tripleo-heat-templates" % tht_tmp
created_env_files = []
env_path = utils.create_breakpoint_cleanup_env(
tht_root, stack.stack_name)
created_env_files.extend(env_path)
param_env_path = utils.create_parameters_env(
stack_params, tht_root, stack.stack_name,
'scale-down-parameters.yaml')
created_env_files.extend(param_env_path)
env_files_tracker = []
env_files, _ = utils.process_multiple_environments(
created_env_files, tht_root,
constants.TRIPLEO_HEAT_TEMPLATES,
env_files_tracker=env_files_tracker)
stack_args = {
'stack_name': stack.stack_name,
'environment_files': env_files_tracker,
'files': env_files,
'timeout_mins': timeout,
'existing': True,
'clear_parameters': list(stack_params.keys())}
heat.stacks.update(stack.id, **stack_args)
finally:
shutil.rmtree(tht_tmp)
def scale_down(log, clients, stack, nodes, timeout=None, verbosity=0,
connection_timeout=None):
"""Unprovision and deletes overcloud nodes from a heat stack.
:param log: Logging object
:type log: Object
:param clients: Application client object.
:type clients: Object
:param stack: Heat Stack object
:type stack: Object
:param nodes: List of nodes to delete. If the node UUID is used the
UUID will be used to lookup the node name before being
passed through to the cleanup playbook.
:type nodes: List
:param timeout: Timeout to use when deleting nodes. If timeout is None
it will be set to 240 minutes.
:type timeout: Integer
:param verbosity: Verbosity level
:type verbosity: Integer
:param connection_timeout: Ansible connection timeout in seconds.
:type connection_timeout: Integer
"""
if not timeout:
timeout = 240
limit_list = list()
for node in nodes:
try:
_node = clients.compute.servers.get(node)
limit_list.append(_node.name)
except Exception:
limit_list.append(node)
if limit_list:
limit_list = ':'.join(limit_list)
else:
limit_list = None
deployment.config_download(
log=log,
clients=clients,
stack=stack,
timeout=connection_timeout,
ansible_playbook_name='scale_playbook.yaml',
limit_hosts=limit_list,
verbosity=verbosity,
deployment_timeout=timeout
)
events = event_utils.get_events(
clients.orchestration, stack_id=stack.stack_name,
event_args={'sort_dir': 'desc', 'limit': 1})
marker = events[0].id if events else None
print('Running scale down')
remove_node_from_stack(clients, stack, nodes, timeout)
utils.wait_for_stack_ready(
orchestration_client=clients.orchestration,
stack_name=stack.stack_name,
action='UPDATE',
marker=marker)
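For reference, the deleted tripleoclient/workflows/scale.py drove scale-down through a Heat stack update: it computed per-role Count, RemovalPolicies, and RemovalPoliciesMode parameters and then updated the stack with existing=True and clear_parameters. A reduced, illustrative form of that parameter assembly is below; the role name and values are hypothetical.

def legacy_removal_params(role, remaining_count, resource_names):
    # Reduced form of the deleted _get_removal_params_from_heat(): shrink the
    # role's ResourceGroup to the remaining count and tell Heat exactly which
    # group members to drop.
    return {
        '{0}Count'.format(role): str(remaining_count),
        '{0}RemovalPolicies'.format(role): [
            {'resource_list': resource_names}],
        # 'append' is forced because 'update' mode can delete unintended nodes
        '{0}RemovalPoliciesMode'.format(role): 'append',
    }


# e.g. legacy_removal_params('Compute', 2, ['1']) keeps two Compute nodes and
# marks ResourceGroup member '1' for removal on the next stack update.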