Add check to NodeDeletionTaskManager for controllers to be deployed

In case we remove a controller from an HA environment we must redeploy
the others, which is done in the manager. A check is added to the list of
controllers retrieved for deployment in order to make sure they are in a
status appropriate for the operation. Otherwise we might face bugs in
particular situations, for example, when the environment is reset and
implicit deployment is started on nodes which are still undergoing the
process, which leads to unexpected erroneous behaviour such as described in the bug

Change-Id: I099660db1f8cd9d179f377799231635865286d3a
Closes-Bug: #1432615
This commit is contained in:
Artem Roma 2015-08-12 17:40:14 +03:00
parent 6065c6771f
commit 8943ada643
5 changed files with 122 additions and 6 deletions

View File

@ -30,6 +30,7 @@ from nailgun.api.v1.validators.network import NetAssignmentValidator
from nailgun.api.v1.validators.node import NodeValidator
from nailgun import consts
from nailgun.errors import errors
from nailgun import objects
from nailgun.objects.serializers.node import NodeInterfacesSerializer
@ -58,11 +59,16 @@ class NodeHandler(SingleHandler):
* 202 (node is successfully scheduled for deletion)
* 400 (data validation failed)
* 404 (node not found in db)
* 403 (one of the controllers is in error state)
"""
node = self.get_object_or_404(self.single, obj_id)
task_manager = NodeDeletionTaskManager(cluster_id=node.cluster_id)
task = task_manager.execute([node], mclient_remove=False)
try:
task = task_manager.execute([node], mclient_remove=False)
except errors.ControllerInErrorState as e:
raise self.http(403, e.message)
self.raise_task(task)
@ -130,6 +136,7 @@ class NodeCollectionHandler(CollectionHandler):
* 202 (nodes are successfully scheduled for deletion)
* 400 (data validation failed)
* 404 (nodes not found in db)
* 403 (one of the controllers is in error state)
"""
# TODO(pkaminski): web.py does not support parsing of array arguments
# in the queryset so we specify the input as comma-separated list
@ -141,7 +148,11 @@ class NodeCollectionHandler(CollectionHandler):
nodes = self.get_objects_list_or_404(self.collection, node_ids)
task_manager = NodeDeletionTaskManager(cluster_id=nodes[0].cluster_id)
task = task_manager.execute(nodes, mclient_remove=False)
try:
task = task_manager.execute(nodes, mclient_remove=False)
except errors.ControllerInErrorState as e:
raise self.http(403, e.message)
self.raise_task(task)

View File

@ -51,6 +51,9 @@ default_messages = {
"InvalidOperatingSystem": "Invalid operating system",
"CannotFindPluginForRelease": "Cannot find plugin for the release",
"UnavailableRelease": "Release is unavailable",
"ControllerInErrorState": ("One of the cluster controllers is in error "
"state, please, eliminate the problem prior "
"to proceeding further"),
# mongo errors
"ExtMongoCheckerError": "Mongo nodes shouldn`t be used with external mongo",

View File

@ -1136,17 +1136,32 @@ class NodeDeletionTaskManager(TaskManager, DeploymentCheckMixin):
objects.Cluster.adjust_nodes_lists_on_controller_removing(
self.cluster, nodes_to_delete, nodes_to_deploy)
if nodes_to_deploy:
# NOTE(aroma): in case of removal of a controller node we do
# implicit redeployment of all remaining controllers here in
# order to preserve the consistency of an HA cluster.
# The reason the following filtering is added is that we must
# redeploy only controllers in ready status. Also, in case
# one of the nodes is in error state we must cancel the whole
# operation, as the result of the redeployment in this case is unpredictable
# and the user may end up with a non-working cluster
controllers_with_ready_status = []
for controller in nodes_to_deploy:
if controller.status == consts.NODE_STATUSES.error:
raise errors.ControllerInErrorState()
elif controller.status == consts.NODE_STATUSES.ready:
controllers_with_ready_status.append(controller)
if controllers_with_ready_status:
logger.debug("There are nodes to deploy: %s",
" ".join([objects.Node.get_node_fqdn(n)
for n in nodes_to_deploy]))
for n in controllers_with_ready_status]))
task_deployment = task.create_subtask(
consts.TASK_NAMES.deployment)
deployment_message = self._call_silently(
task_deployment,
tasks.DeploymentTask,
nodes_to_deploy,
controllers_with_ready_status,
method_name='message'
)
db().flush()

View File

@ -16,6 +16,7 @@
import logging
from nailgun import consts
from nailgun import objects
from nailgun.db.sqlalchemy.models import IPAddr
@ -117,3 +118,41 @@ class TestNodeDeletion(BaseIntegrationTest):
all([node['mclient_remove'] is False
for node in msg['args']['nodes']])
)
class TestNodeDeletionBadRequest(BaseIntegrationTest):
def test_node_handlers_deletion_bad_request(self):
self.env.create(nodes_kwargs=[
{'roles': ['controller'], 'status': consts.NODE_STATUSES.error}
])
cluster_db = self.env.clusters[0]
node_to_delete = self.env.create_node(
cluster_id=cluster_db.id,
roles=['controller'],
status=consts.NODE_STATUSES.ready
)
err_msg = ("One of the cluster controllers is in error state, "
"please, eliminate the problem prior to proceeding further")
resp = self.app.delete(
reverse(
'NodeHandler',
kwargs={'obj_id': node_to_delete.id}),
headers=self.default_headers,
expect_errors=True
)
self.assertEqual(resp.status_code, 403)
self.assertIn(err_msg, resp.body)
url = reverse('NodeCollectionHandler')
query_str = 'ids={0}'.format(node_to_delete.id)
resp = self.app.delete(
'{0}?{1}'.format(url, query_str),
headers=self.default_headers,
expect_errors=True
)
self.assertEqual(resp.status_code, 403)
self.assertIn(err_msg, resp.body)

View File

@ -791,6 +791,53 @@ class TestTaskManagers(BaseIntegrationTest):
self.assertRaises(
errors.InvalidData, manager_.execute, cluster_db.nodes)
@mock.patch('nailgun.task.manager.rpc.cast')
def test_node_deletion_redeploy_started_for_proper_controllers(self,
mcast):
self.env.create(nodes_kwargs=[
{'roles': ['controller'],
'status': consts.NODE_STATUSES.provisioned},
{'roles': ['controller'],
'status': consts.NODE_STATUSES.discover},
])
cluster_db = self.env.clusters[0]
node_to_delete = self.env.create_node(
cluster_id=cluster_db.id,
roles=['controller'],
status=consts.NODE_STATUSES.ready
)
node_to_deploy = self.env.create_node(
cluster_id=cluster_db.id,
roles=['controller'],
status=consts.NODE_STATUSES.ready
)
manager_ = manager.NodeDeletionTaskManager(cluster_id=cluster_db.id)
manager_.execute([node_to_delete])
args, kwargs = mcast.call_args_list[0]
depl_info = args[1][0]['args']['deployment_info']
self.assertEqual(node_to_deploy.uid, depl_info[0]['uid'])
def test_node_deletion_task_failed_with_controller_in_error(self):
self.env.create(nodes_kwargs=[
{'roles': ['controller'],
'status': consts.NODE_STATUSES.error},
])
cluster_db = self.env.clusters[0]
node_to_delete = self.env.create_node(
cluster_id=cluster_db.id,
roles=['controller'],
status=consts.NODE_STATUSES.ready
)
manager_ = manager.NodeDeletionTaskManager(cluster_id=cluster_db.id)
self.assertRaises(errors.ControllerInErrorState,
manager_.execute, [node_to_delete])
@fake_tasks()
def test_deployment_on_controller_removal_via_apply_changes(self):
self.env.create(
@ -877,7 +924,8 @@ class TestTaskManagers(BaseIntegrationTest):
@mock.patch('nailgun.rpc.cast')
def test_delete_nodes_reelection_if_primary_for_deletion(self, _):
self.env.create(
nodes_kwargs=[{'roles': ['controller']}] * 3)
nodes_kwargs=[{'roles': ['controller'],
'status': consts.NODE_STATUSES.ready}] * 3)
cluster = self.env.clusters[0]
task_manager = manager.NodeDeletionTaskManager(cluster_id=cluster.id)
objects.Cluster.set_primary_roles(cluster, self.env.nodes)