From 40472f49b3299751090d6c42b474f1d98fb9afda Mon Sep 17 00:00:00 2001 From: jgilaber Date: Thu, 26 Feb 2026 15:26:26 +0100 Subject: [PATCH] Fix exception handling for find_instance calls The nova_helper.find_instance method raises ComputeResourceNotFound when an instance is not found (via the @handle_nova_error decorator), but several callers were checking for None return values instead of catching the exception. This caused uncaught exceptions when instances were deleted between audit and action execution. Updated the following methods to properly catch ComputeResourceNotFound: - migration.Migrate.migrate() - migration.Migrate.abort() - resize.Resize.resize() - stop.Stop.stop() - volume_migration.VolumeMigrate._volume_can_swap() - zone_migration.ZoneMigration.volumes_migration() Also fixed the find_instance docstring to accurately document that it raises ComputeResourceNotFound instead of returning None, and added unit tests for the different actions for the scenario where the instance is not found. Closes-Bug: #2142208 Generated-By: claude-code (claude-opus-4-5) Change-Id: I69c50f50fb037bc2af13c6bbc3f42a794e46a80a Signed-off-by: jgilaber --- watcher/applier/actions/migration.py | 44 +++--- watcher/applier/actions/resize.py | 26 ++-- watcher/applier/actions/stop.py | 17 ++- watcher/applier/actions/volume_migration.py | 10 +- watcher/common/nova_helper.py | 3 +- .../strategy/strategies/zone_migration.py | 22 ++- .../unit/applier/actions/test_migration.py | 29 ++++ .../tests/unit/applier/actions/test_resize.py | 16 ++ .../tests/unit/applier/actions/test_stop.py | 3 +- .../applier/actions/test_volume_migration.py | 10 ++ .../strategies/test_zone_migration.py | 142 ++++++++++++++++++ 11 files changed, 275 insertions(+), 47 deletions(-) diff --git a/watcher/applier/actions/migration.py b/watcher/applier/actions/migration.py index 03a1e2411..14283a6fc 100644 --- a/watcher/applier/actions/migration.py +++ b/watcher/applier/actions/migration.py @@ -165,20 +165,21 @@ class Migrate(base.BaseAction): else: LOG.debug("Migrate instance %s to %s", self.instance_uuid, destination) - instance = nova.find_instance(self.instance_uuid) - if instance: - if self.migration_type == self.LIVE_MIGRATION: - return self._live_migrate_instance(nova, destination) - elif self.migration_type == self.COLD_MIGRATION: - return self._cold_migrate_instance(nova, destination) - else: - raise exception.Invalid( - message=(_("Migration of type '%(migration_type)s' is not " - "supported.") % - {'migration_type': self.migration_type})) - else: + try: + nova.find_instance(self.instance_uuid) + except exception.ComputeResourceNotFound: raise exception.InstanceNotFound(name=self.instance_uuid) + if self.migration_type == self.LIVE_MIGRATION: + return self._live_migrate_instance(nova, destination) + elif self.migration_type == self.COLD_MIGRATION: + return self._cold_migrate_instance(nova, destination) + else: + raise exception.Invalid( + message=(_("Migration of type '%(migration_type)s' is not " + "supported.") % + {'migration_type': self.migration_type})) + def execute(self): return self.migrate(destination=self.destination_node) @@ -187,17 +188,18 @@ class Migrate(base.BaseAction): def abort(self): nova = nova_helper.NovaHelper(osc=self.osc) - instance = nova.find_instance(self.instance_uuid) - if instance: - if self.migration_type == self.COLD_MIGRATION: - return self._abort_cold_migrate(nova) - elif self.migration_type == self.LIVE_MIGRATION: - return self._abort_live_migrate( - nova, source=self.source_node, - destination=self.destination_node) - else: + try: + nova.find_instance(self.instance_uuid) + except exception.ComputeResourceNotFound: raise exception.InstanceNotFound(name=self.instance_uuid) + if self.migration_type == self.COLD_MIGRATION: + return self._abort_cold_migrate(nova) + elif self.migration_type == self.LIVE_MIGRATION: + return self._abort_live_migrate( + nova, source=self.source_node, + destination=self.destination_node) + def pre_condition(self): """Check migration preconditions diff --git a/watcher/applier/actions/resize.py b/watcher/applier/actions/resize.py index 7c10841ea..11766c18e 100644 --- a/watcher/applier/actions/resize.py +++ b/watcher/applier/actions/resize.py @@ -79,17 +79,23 @@ class Resize(base.BaseAction): nova = nova_helper.NovaHelper(osc=self.osc) LOG.debug("Resize instance %s to %s flavor", self.instance_uuid, self.flavor) - instance = nova.find_instance(self.instance_uuid) + try: + nova.find_instance(self.instance_uuid) + except exception.ComputeResourceNotFound: + LOG.warning("Instance %s not found, skipping resize", + self.instance_uuid) + raise exception.InstanceNotFound(name=self.instance_uuid) + result = None - if instance: - try: - result = nova.resize_instance( - instance_id=self.instance_uuid, flavor=self.flavor) - except Exception as exc: - LOG.exception(exc) - LOG.critical( - "Unexpected error occurred. Resizing failed for " - "instance %s.", self.instance_uuid) + try: + result = nova.resize_instance( + instance_id=self.instance_uuid, flavor=self.flavor) + except Exception as exc: + LOG.exception(exc) + LOG.critical( + "Unexpected error occurred. Resizing failed for " + "instance %s.", self.instance_uuid) + return False return result def execute(self): diff --git a/watcher/applier/actions/stop.py b/watcher/applier/actions/stop.py index 925d70c6a..5c78b20f7 100644 --- a/watcher/applier/actions/stop.py +++ b/watcher/applier/actions/stop.py @@ -84,20 +84,21 @@ class Stop(base.BaseAction): return True else: # Check if failure was due to instance not found (idempotent) - instance = nova.find_instance(self.instance_uuid) - if not instance: + try: + nova.find_instance(self.instance_uuid) + except exception.ComputeResourceNotFound: LOG.info( "Instance %(uuid)s not found, " "considering stop operation successful", {'uuid': self.instance_uuid} ) return True - else: - LOG.error( - "Failed to stop instance %(uuid)s", - {'uuid': self.instance_uuid} - ) - return False + + LOG.error( + "Failed to stop instance %(uuid)s", + {'uuid': self.instance_uuid} + ) + return False def execute(self): return self.stop() diff --git a/watcher/applier/actions/volume_migration.py b/watcher/applier/actions/volume_migration.py index 04ae2effe..9160aac40 100644 --- a/watcher/applier/actions/volume_migration.py +++ b/watcher/applier/actions/volume_migration.py @@ -142,7 +142,15 @@ class VolumeMigrate(base.BaseAction): # since it has attachments we need to validate nova's constraints instance_id = volume.attachments[0]['server_id'] - instance_status = self.nova_util.find_instance(instance_id).status + try: + instance = self.nova_util.find_instance(instance_id) + except exception.ComputeResourceNotFound: + LOG.debug( + "Could not find instance %s, could not determine whether" + "it's safe to migrate", instance_id + ) + return False + instance_status = instance.status LOG.debug( f"volume: {volume.id} is attached to instance: {instance_id} " f"in instance status: {instance_status}") diff --git a/watcher/common/nova_helper.py b/watcher/common/nova_helper.py index ed1dbb3ef..7381a3097 100644 --- a/watcher/common/nova_helper.py +++ b/watcher/common/nova_helper.py @@ -640,7 +640,8 @@ class NovaHelper: """Find an instance by its ID. :param instance_id: the UUID of the instance to find - :returns: Server wrapper object if found, None if not found + :returns: Server wrapper object if found + :raises: ComputeResourceNotFound if instance is not found """ instance = self.connection.compute.get_server(instance_id) return Server.from_openstacksdk(instance) diff --git a/watcher/decision_engine/strategy/strategies/zone_migration.py b/watcher/decision_engine/strategy/strategies/zone_migration.py index 971606e99..5308bd61b 100644 --- a/watcher/decision_engine/strategy/strategies/zone_migration.py +++ b/watcher/decision_engine/strategy/strategies/zone_migration.py @@ -18,6 +18,7 @@ from oslo_utils import timeutils from cinderclient.v3.volumes import Volume from watcher._i18n import _ from watcher.common import cinder_helper +from watcher.common import exception from watcher.common import nova_helper from watcher.decision_engine.model import element from watcher.decision_engine.strategy.strategies import base @@ -424,11 +425,22 @@ class ZoneMigration(base.ZoneMigrationBaseStrategy): # if with_attached_volume is True, migrate instances attached # to the volumes, only if they are possible migration targets if self.with_attached_volume: - instances = [ - self.nova.find_instance(dic.get('server_id')) - for dic in volume.attachments - if dic.get('server_id') in instance_target_ids - ] + instances = [] + for dic in volume.attachments: + instance_id = dic.get('server_id') + try: + instance = self.nova.find_instance(instance_id) + except exception.ComputeResourceNotFound: + # if an instance can't be found, we won't consider it + # for migration, but the audit should not fail because + # of it + LOG.debug( + "Could not find instance %s it will not be " + "considered for migration", instance_id + ) + continue + if instance_id in instance_target_ids: + instances.append(instance) self.instances_migration(instances, action_counter) action_counter.add_pool(pool) diff --git a/watcher/tests/unit/applier/actions/test_migration.py b/watcher/tests/unit/applier/actions/test_migration.py index ca4b601ae..202f15c4c 100644 --- a/watcher/tests/unit/applier/actions/test_migration.py +++ b/watcher/tests/unit/applier/actions/test_migration.py @@ -395,3 +395,32 @@ class TestMigration(test_utils.NovaResourcesMixin, base.TestCase): self.m_helper.abort_live_migrate.assert_called_once_with( instance_id=self.INSTANCE_UUID, source="compute1-hostname", destination="compute2-hostname") + + def test_execute_live_migration_failure_instance_not_found(self): + # Live migration fails but instance doesn't exist (idempotent) + self.m_helper.find_instance.side_effect = ( + exception.ComputeResourceNotFound(self.INSTANCE_UUID)) + self.m_helper.live_migrate_instance.return_value = False + + self.assertRaisesRegex( + exception.InstanceNotFound, + f"The instance '{self.INSTANCE_UUID}' could not be found", + self.action.execute + ) + + # Should check instance existence + self.m_helper.find_instance.assert_called_once_with(self.INSTANCE_UUID) + + def test_execute_cold_migration_failure_instance_not_found(self): + # Cold migration fails but instance doesn't exist (idempotent) + self.m_helper.find_instance.side_effect = ( + exception.ComputeResourceNotFound(self.INSTANCE_UUID)) + self.m_helper.watcher_non_live_migrate_instance.return_value = False + + self.assertRaisesRegex( + exception.InstanceNotFound, + f"The instance '{self.INSTANCE_UUID}' could not be found", + self.action.execute + ) + # Should check instance existence + self.m_helper.find_instance.assert_called_once_with(self.INSTANCE_UUID) diff --git a/watcher/tests/unit/applier/actions/test_resize.py b/watcher/tests/unit/applier/actions/test_resize.py index b672f24b1..4fb3e6df3 100644 --- a/watcher/tests/unit/applier/actions/test_resize.py +++ b/watcher/tests/unit/applier/actions/test_resize.py @@ -116,3 +116,19 @@ class TestResize(base.TestCase): exception.ActionExecutionFailure, "Flavor x1 not found", self.action.pre_condition) + + def test_execute_resize_failure_instance_not_found(self): + # Resize fails but instance doesn't exist (idempotent) + self.r_helper.find_instance.side_effect = ( + exception.ComputeResourceNotFound(self.INSTANCE_UUID) + ) + self.r_helper.resize_instance.return_value = False + + self.assertRaisesRegex( + exception.InstanceNotFound, + f"The instance '{self.INSTANCE_UUID}' could not be found", + self.action.execute + ) + + # Should check instance existence + self.r_helper.find_instance.assert_called_once_with(self.INSTANCE_UUID) diff --git a/watcher/tests/unit/applier/actions/test_stop.py b/watcher/tests/unit/applier/actions/test_stop.py index f406a017a..b09391126 100644 --- a/watcher/tests/unit/applier/actions/test_stop.py +++ b/watcher/tests/unit/applier/actions/test_stop.py @@ -135,7 +135,8 @@ class TestStop(test_utils.NovaResourcesMixin, base.TestCase): def test_execute_stop_failure_instance_not_found(self): # Stop operation fails but instance doesn't exist (idempotent) - self.m_helper.find_instance.return_value = None + self.m_helper.find_instance.side_effect = ( + exception.ComputeResourceNotFound()) self.m_helper.stop_instance.return_value = False result = self.action.execute() diff --git a/watcher/tests/unit/applier/actions/test_volume_migration.py b/watcher/tests/unit/applier/actions/test_volume_migration.py index 2706b49b8..c49b14159 100644 --- a/watcher/tests/unit/applier/actions/test_volume_migration.py +++ b/watcher/tests/unit/applier/actions/test_volume_migration.py @@ -20,6 +20,7 @@ from watcher.applier.actions import base as baction from watcher.applier.actions import volume_migration from watcher.common import cinder_helper from watcher.common import clients +from watcher.common import exception from watcher.common import keystone_helper from watcher.common import nova_helper from watcher.tests.unit import base @@ -208,6 +209,15 @@ class TestMigration(base.TestCase): result = self.action_swap._can_swap(volume) self.assertFalse(result) + def test_can_swap_instance_not_found(self): + volume = self.fake_volume( + status='in-use', attachments=[ + {'server_id': TestMigration.INSTANCE_UUID}]) + self.m_n_helper.find_instance.side_effect = ( + exception.ComputeResourceNotFound(TestMigration.INSTANCE_UUID)) + result = self.action_swap._can_swap(volume) + self.assertFalse(result) + def test_swap_success(self): volume = self.fake_volume( status='in-use', attachments=[ diff --git a/watcher/tests/unit/decision_engine/strategy/strategies/test_zone_migration.py b/watcher/tests/unit/decision_engine/strategy/strategies/test_zone_migration.py index 3309a7cf3..44ee4c34c 100644 --- a/watcher/tests/unit/decision_engine/strategy/strategies/test_zone_migration.py +++ b/watcher/tests/unit/decision_engine/strategy/strategies/test_zone_migration.py @@ -17,6 +17,7 @@ from unittest import mock import cinderclient +from watcher.common import exception from watcher.common import nova_helper from watcher.common import utils from watcher.decision_engine.strategy import strategies @@ -1288,6 +1289,147 @@ class TestZoneMigration(test_utils.NovaResourcesMixin, TestBaseStrategy): if item['name'] == "live_instance_migrate_ratio"][0] self.assertEqual(100, live_ind) + def test_execute_mixed_instances_volumes_with_attached_not_found(self): + """Test that the strategy handles properly an instance not found. + + If a instance is missing when checking the volume attachments, the + strategy execution should not fail, just skip that instance. The + instance should still be migrated, but in the regular order, as if + with_attached_volume was False. + """ + server_info = { + "compute_host": "src1", + "name": "INSTANCE_1", + "id": "d010ef1f-dc19-4982-9383-087498bfde03", + "vm_state": "active" + } + instance_on_src1_1 = nova_helper.Server.from_openstacksdk( + self.create_openstacksdk_server(**server_info) + ) + server_info = { + "compute_host": "src2", + "name": "INSTANCE_2", + "id": "d020ef1f-dc19-4982-9383-087498bfde03", + "vm_state": "active" + } + instance_on_src2_2 = nova_helper.Server.from_openstacksdk( + self.create_openstacksdk_server(**server_info) + ) + server_info = { + "compute_host": "src1", + "name": "INSTANCE_3", + "id": "d030ef1f-dc19-4982-9383-087498bfde03", + "vm_state": "active" + } + instance_on_src1_3 = nova_helper.Server.from_openstacksdk( + self.create_openstacksdk_server(**server_info) + ) + self.m_n_helper.get_instance_list.return_value = [ + instance_on_src1_1, + instance_on_src2_2, + instance_on_src1_3 + ] + + volume_on_src1_1 = self.fake_volume(host="src1@back1#pool1", + id=volume_uuid_mapping["volume_1"], + name="volume_1") + volume_on_src1_2 = self.fake_volume(host="src1@back1#pool1", + id=volume_uuid_mapping["volume_2"], + name="volume_2") + volume_on_src2_1 = self.fake_volume(host="src2@back1#pool1", + id=volume_uuid_mapping["volume_3"], + name="volume_3") + self.m_c_helper.get_volume_list.return_value = [ + volume_on_src1_1, + volume_on_src1_2, + volume_on_src2_1, + ] + + volume_on_src1_1.status = 'in-use' + volume_on_src1_1.attachments = [{ + "server_id": "d010ef1f-dc19-4982-9383-087498bfde03", + "attachment_id": "attachment1" + }] + + self.input_parameters["compute_nodes"] = [ + {"src_node": "src1", "dst_node": "dst1"}, + ] + self.input_parameters["storage_pools"] = [ + {"src_pool": "src1@back1#pool1", "dst_pool": "dst1@back1#pool1", + "src_type": "type1", "dst_type": "type1"}, + ] + self.input_parameters["with_attached_volume"] = True + + uuid = "d010ef1f-dc19-4982-9383-087498bfde03" + self.m_n_helper.find_instance.side_effect = ( + exception.ComputeResourceNotFound(uuid) + ) + + solution = self.strategy.execute() + # Check migrations + action_types = collections.Counter( + [action['action_type'] + for action in solution.actions]) + expected_vol_migrations = [ + {'action_type': 'volume_migrate', + 'input_parameters': + {'migration_type': 'migrate', + 'destination_node': 'dst1@back1#pool1', + 'resource_name': 'volume_1', + 'resource_id': '74454247-a064-4b34-8f43-89337987720e'}}, + {'action_type': 'volume_migrate', + 'input_parameters': + {'migration_type': 'migrate', + 'destination_node': 'dst1@back1#pool1', + 'resource_name': 'volume_2', + 'resource_id': 'a16c811e-2521-4fd3-8779-6a94ccb3be73'}} + ] + expected_vm_migrations = [ + {'action_type': 'migrate', + 'input_parameters': + {'migration_type': 'live', + 'source_node': 'src1', + 'destination_node': 'dst1', + 'resource_name': 'INSTANCE_1', + 'resource_id': 'd010ef1f-dc19-4982-9383-087498bfde03'}}, + {'action_type': 'migrate', + 'input_parameters': + {'migration_type': 'live', + 'source_node': 'src1', + 'destination_node': 'dst1', + 'resource_name': 'INSTANCE_3', + 'resource_id': 'd030ef1f-dc19-4982-9383-087498bfde03'}} + ] + migrated_volumes = [action + for action in solution.actions + if action['action_type'] == 'volume_migrate'] + self.assertEqual(2, action_types.get("volume_migrate", 0)) + self.assertEqual(expected_vol_migrations, migrated_volumes) + migrated_vms = [action + for action in solution.actions + if action['action_type'] == 'migrate'] + self.assertEqual(2, action_types.get("migrate", 0)) + self.assertEqual(expected_vm_migrations, migrated_vms) + + self.assertEqual(2, action_types.get("migrate", 0)) + # check that the live migration is the third action, after the + # volume migrations + second_action = solution.actions[2]['input_parameters'] + self.assertEqual(second_action['migration_type'], 'live') + + # All the detached volumes in the pool should be migrated + volume_mig_ind = [item['value'] for item in solution.global_efficacy + if item['name'] == "volume_migrate_ratio"][0] + self.assertEqual(100, volume_mig_ind) + # All the attached volumes in the pool should be swapped + volume_swap_ind = [item['value'] for item in solution.global_efficacy + if item['name'] == "volume_update_ratio"][0] + self.assertEqual(100, volume_swap_ind) + # All the instances in src1 should be migrated + live_ind = [item['value'] for item in solution.global_efficacy + if item['name'] == "live_instance_migrate_ratio"][0] + self.assertEqual(100, live_ind) + def test_instance_migration_exists(self): fake_actions = [