diff --git a/doc/source/actions/migrate.rst b/doc/source/actions/migrate.rst index 4e808cc0f..97cc9a536 100644 --- a/doc/source/actions/migrate.rst +++ b/doc/source/actions/migrate.rst @@ -35,3 +35,17 @@ parameter type required description If not specified, nova-scheduler will determine the destination ======================== ====== ======== =================================== + +Skipping conditions +-------------------- + +Migrate actions will be automatically skipped in the pre_condition phase in +the following cases: + +- The server does not exist +- The server is not running on the source_node + +On other conditions the action will be FAILED in the pre_condition check: + +- Destination node (if specified) does not exist or is disabled +- The server status is not ACTIVE for live migration diff --git a/releasenotes/notes/migration-precondition-check-ff55be4.yaml b/releasenotes/notes/migration-precondition-check-ff55be4.yaml new file mode 100644 index 000000000..998757275 --- /dev/null +++ b/releasenotes/notes/migration-precondition-check-ff55be4.yaml @@ -0,0 +1,17 @@ +--- +features: + - | + Added a pre-condition check to the migration action to prevent executing + migrations when required criteria are not met. In following conditions, + the action status will be set to SKIPPED: + + - Instance is not found + - Instance is not running on source node + + On other conditions the action will be FAILED in the pre_condition + check: + + - Destination node (if specified) does not exists or is disabled + - Instance is not ACTIVE and migration_type is live. + + This improves safety and correctness of migrations. diff --git a/watcher/applier/actions/migration.py b/watcher/applier/actions/migration.py index 4b326d12c..26cd5499f 100644 --- a/watcher/applier/actions/migration.py +++ b/watcher/applier/actions/migration.py @@ -199,9 +199,60 @@ class Migrate(base.BaseAction): raise exception.InstanceNotFound(name=self.instance_uuid) def pre_condition(self): - # TODO(jed): check if the instance exists / check if the instance is on - # the source_node - pass + """Check migration preconditions + + Skipping conditions: + - Instance does not exist + - Instance is not running on the source_node + Failing conditions: + - Destination node (if specified) does not exist or is disabled + - Instance status is not ACTIVE for live migration + """ + nova = nova_helper.NovaHelper(osc=self.osc) + + # Check that the instance exists + try: + instance = nova.find_instance(self.instance_uuid) + except nova_helper.nvexceptions.NotFound: + raise exception.ActionSkipped( + _("Instance %s not found") % self.instance_uuid) + + # Check that the instance is running on source_node + instance_host = getattr(instance, 'OS-EXT-SRV-ATTR:host', None) + if instance_host != self.source_node: + raise exception.ActionSkipped( + _("Instance %(instance)s is not running on source node " + "%(source)s (currently on %(current)s)") % + {'instance': self.instance_uuid, + 'source': self.source_node, + 'current': instance_host}) + + # Check destination node if specified + if self.destination_node: + try: + # Find the compute node and check if service is enabled + dest_node = nova.get_compute_node_by_hostname( + self.destination_node) + + # Check if compute service is enabled + if dest_node.status != 'enabled': + raise exception.ActionExecutionFailure( + _("Destination node %s is not in enabled state") % + self.destination_node) + except exception.ComputeNodeNotFound: + raise exception.ActionExecutionFailure( + _("Destination node %s not found") % + self.destination_node) + + # Check instance status based on migration type + instance_status = instance.status + if self.migration_type == self.LIVE_MIGRATION: + if instance_status != 'ACTIVE': + raise exception.ActionExecutionFailure( + _("Live migration requires instance %(instance)s to be " + "in ACTIVE status (current status: %(status)s)") % + {'instance': self.instance_uuid, + 'status': instance_status}) def post_condition(self): # TODO(jed): check extra parameters (network response, etc.) diff --git a/watcher/tests/applier/actions/test_migration.py b/watcher/tests/applier/actions/test_migration.py index 9546e408c..d941dc6b4 100644 --- a/watcher/tests/applier/actions/test_migration.py +++ b/watcher/tests/applier/actions/test_migration.py @@ -17,6 +17,9 @@ from unittest import mock import jsonschema +from novaclient.v2 import hypervisors +from novaclient.v2 import servers + from watcher.applier.actions import base as baction from watcher.applier.actions import migration from watcher.common import clients @@ -133,11 +136,163 @@ class TestMigration(base.TestCase): self.assertRaises(jsonschema.ValidationError, self.action.validate_parameters) - def test_migration_pre_condition(self): - try: - self.action.pre_condition() - except Exception as exc: - self.fail(exc) + def test_migration_pre_condition_success(self): + """Test successful pre_condition with all checks passing""" + parameters = {baction.BaseAction.RESOURCE_ID: self.INSTANCE_UUID, + 'migration_type': 'live', + 'source_node': 'compute1-hostname', + 'destination_node': 'compute2-hostname'} + self.action.input_parameters = parameters + instance_info = { + 'id': self.INSTANCE_UUID, + 'status': 'ACTIVE', + 'OS-EXT-SRV-ATTR:host': 'compute1-hostname' + } + instance = servers.Server(servers.ServerManager, info=instance_info) + + compute_node_info = { + 'id': 'compute2-hostname', + 'status': 'enabled', + 'hypervisor_hostname': 'compute1-hostname', + 'service': { + 'host': 'compute1-hostname' + } + } + compute_node = hypervisors.Hypervisor( + hypervisors.HypervisorManager, info=compute_node_info) + + self.m_helper.find_instance.return_value = instance + self.m_helper.get_compute_node_by_hostname.return_value = compute_node + + self.action.pre_condition() + + def test_pre_condition_instance_not_found(self): + """Test pre_condition fails when instance doesn't exist""" + self.m_helper.find_instance.side_effect = ( + nova_helper.nvexceptions.NotFound('404')) + + self.assertRaisesRegex( + exception.ActionSkipped, + f"Instance {self.INSTANCE_UUID} not found", + self.action.pre_condition) + + def test_pre_condition_instance_on_wrong_host(self): + """Test pre_condition fails when instance is on wrong host""" + instance_info = { + 'id': self.INSTANCE_UUID, + 'status': 'ACTIVE', + 'OS-EXT-SRV-ATTR:host': 'wrong-hostname' + } + instance = servers.Server(servers.ServerManager, info=instance_info) + + self.m_helper.find_instance.return_value = instance + + self.assertRaisesRegex( + exception.ActionSkipped, + f"Instance {self.INSTANCE_UUID} is not running on source node " + r"compute1-hostname \(currently on wrong-hostname\)", + self.action.pre_condition) + + def test_pre_condition_destination_node_not_found(self): + """Test pre_condition fails when destination node doesn't exist""" + instance_info = { + 'id': self.INSTANCE_UUID, + 'status': 'ACTIVE', + 'OS-EXT-SRV-ATTR:host': 'compute1-hostname' + } + instance = servers.Server(servers.ServerManager, info=instance_info) + + self.m_helper.find_instance.return_value = instance + self.m_helper.get_compute_node_by_hostname.side_effect = ( + exception.ComputeNodeNotFound(name='compute2-hostname')) + + self.assertRaisesRegex( + exception.ActionExecutionFailure, + "Destination node compute2-hostname not found", + self.action.pre_condition) + + def test_pre_condition_destination_node_disabled(self): + """Test pre_condition fails when destination node is disabled""" + instance_info = { + 'id': self.INSTANCE_UUID, + 'status': 'ACTIVE', + 'OS-EXT-SRV-ATTR:host': 'compute1-hostname' + } + instance = servers.Server(servers.ServerManager, info=instance_info) + + compute_node_info = { + 'id': 'compute2-hostname', + 'status': 'disabled', + 'hypervisor_hostname': 'compute2-hostname', + 'service': { + 'host': 'compute2-hostname' + } + } + compute_node = hypervisors.Hypervisor( + hypervisors.HypervisorManager, info=compute_node_info) + + self.m_helper.find_instance.return_value = instance + self.m_helper.get_compute_node_by_hostname.return_value = compute_node + + self.assertRaisesRegex( + exception.ActionExecutionFailure, + "Destination node compute2-hostname is not in enabled state", + self.action.pre_condition) + + def test_pre_condition_live_migration_wrong_status(self): + """Test pre_condition fails live migration with non-ACTIVE status""" + instance_info = { + 'id': self.INSTANCE_UUID, + 'status': 'SHUTOFF', + 'OS-EXT-SRV-ATTR:host': 'compute1-hostname' + } + instance = servers.Server(servers.ServerManager, info=instance_info) + + compute_node_info = { + 'id': 'compute2-hostname', + 'status': 'enabled', + 'hypervisor_hostname': 'compute2-hostname', + 'service': { + 'host': 'compute2-hostname' + } + } + compute_node = hypervisors.Hypervisor( + hypervisors.HypervisorManager, info=compute_node_info) + + self.m_helper.find_instance.return_value = instance + self.m_helper.get_compute_node_by_hostname.return_value = compute_node + + self.assertRaisesRegex( + exception.ActionExecutionFailure, + f"Live migration requires instance {self.INSTANCE_UUID} to be in " + r"ACTIVE status \(current status: SHUTOFF\)", + self.action.pre_condition) + + def test_pre_condition_no_destination_node(self): + """Test pre_condition with no destination node specified""" + instance_info = { + 'id': self.INSTANCE_UUID, + 'status': 'ACTIVE', + 'OS-EXT-SRV-ATTR:host': 'compute1-hostname' + } + instance = servers.Server(servers.ServerManager, info=instance_info) + + self.m_helper.find_instance.return_value = instance + + # Create action without destination_node + params = { + "migration_type": "live", + "source_node": "compute1-hostname", + "destination_node": None, + baction.BaseAction.RESOURCE_ID: self.INSTANCE_UUID, + } + action = migration.Migrate(mock.Mock()) + action.input_parameters = params + + action.pre_condition() + + # Ensure get_compute_node_by_hostname was not called + self.m_helper.get_compute_node_by_hostname.assert_not_called() def test_migration_post_condition(self): try: