From ecb5f1717191379b5ade160ae814f7da8b91e7e8 Mon Sep 17 00:00:00 2001 From: melanie witt Date: Fri, 28 Jan 2022 21:21:06 +0000 Subject: [PATCH] Raise InstanceNotFound on fkey constraint fail saving info cache We have seen tracebacks in the gate in the nova-multi-cell job during cross-cell migrations of a failure to save an instance info cache due to a foreign key constraint violation on instance_uuid. This can occur if we happen to receive an external event from neutron after the instance record has been moved to the destination cell. This adds a catch of DBReferenceError from oslo.db and raises InstanceNotFound if the key is instance_uuid, else it reraises. One more step is required to prevent tracing which is to log at level INFO instead of level ERROR in the case of InstanceNotFound. Then reraise it for callers to handle. Closes-Bug: #1944787 Change-Id: I71b7312879dab6857255c78c21ad8dd45d5ea00f --- nova/network/neutron.py | 13 +++++++ nova/objects/instance_info_cache.py | 23 ++++++++++-- nova/tests/unit/network/test_neutron.py | 35 +++++++++++++++++++ .../unit/objects/test_instance_info_cache.py | 25 +++++++++++++ 4 files changed, 93 insertions(+), 3 deletions(-) diff --git a/nova/network/neutron.py b/nova/network/neutron.py index e02da93b574d..774620c8c4d7 100644 --- a/nova/network/neutron.py +++ b/nova/network/neutron.py @@ -124,6 +124,19 @@ def update_instance_cache_with_nw_info(impl, context, instance, nw_info=None): ic.network_info = nw_info ic.save() instance.info_cache = ic + except exception.InstanceNotFound as e: + # The instance could have moved during a cross-cell migration when we + # receive an external event from neutron. Avoid logging a traceback + # when it happens. + msg = str(e) + if e.__class__.__name__.endswith('_Remote'): + # If this exception was raised remotely over RPC, the traceback(s) + # will be appended to the message. Truncate it in that case. + msg = utils.safe_truncate(msg.split('\n', 1)[0], 255) + LOG.info('Failed storing info cache due to: %s. ' + 'The instance may have moved to another cell during a ' + 'cross-cell migration', msg, instance=instance) + raise exception.InstanceNotFound(message=msg) except Exception: with excutils.save_and_reraise_exception(): LOG.exception('Failed storing info cache', instance=instance) diff --git a/nova/objects/instance_info_cache.py b/nova/objects/instance_info_cache.py index 06e0af3f3b24..3ab430baa1f5 100644 --- a/nova/objects/instance_info_cache.py +++ b/nova/objects/instance_info_cache.py @@ -12,6 +12,7 @@ # License for the specific language governing permissions and limitations # under the License. +from oslo_db import exception as db_exc from oslo_log import log as logging from nova.db.main import api as db @@ -75,9 +76,25 @@ class InstanceInfoCache(base.NovaPersistentObject, base.NovaObject): if 'network_info' in self.obj_what_changed(): nw_info_json = self.fields['network_info'].to_primitive( self, 'network_info', self.network_info) - rv = db.instance_info_cache_update(self._context, - self.instance_uuid, - {'network_info': nw_info_json}) + + inst_uuid = self.instance_uuid + + try: + rv = db.instance_info_cache_update( + self._context, inst_uuid, {'network_info': nw_info_json}) + except db_exc.DBReferenceError as exp: + if exp.key != 'instance_uuid': + raise + # NOTE(melwitt): It is possible for us to fail here with a + # foreign key constraint violation on instance_uuid when we + # attempt to save the instance network info cache after + # receiving a network-changed external event from neutron + # during a cross-cell migration. This means the instance record + # is not found in this cell database and we can raise + # InstanceNotFound to signal that in a way that callers know + # how to handle. + raise exception.InstanceNotFound(instance_id=inst_uuid) + self._from_db_object(self._context, self, rv) self.obj_reset_changes() diff --git a/nova/tests/unit/network/test_neutron.py b/nova/tests/unit/network/test_neutron.py index 5056b70c4ec8..8cd8fc9ba827 100644 --- a/nova/tests/unit/network/test_neutron.py +++ b/nova/tests/unit/network/test_neutron.py @@ -7364,6 +7364,41 @@ class TestAPIModuleMethods(test.NoDBTestCase): self.assertEqual(networks, [{'id': 1}, {'id': 2}, {'id': 3}]) + @mock.patch('nova.network.neutron.LOG.info') + @mock.patch('nova.network.neutron.LOG.exception') + @mock.patch('nova.objects.instance_info_cache.InstanceInfoCache.save') + def test_update_instance_cache_with_nw_info_not_found(self, mock_save, + mock_log_exc, + mock_log_info): + """Tests that an attempt to update (save) the instance info cache will + not log a traceback but will reraise the exception for caller handling. + """ + # Simulate the oslo.messaging created "_Remote" subclass + # type we'll be catching. + class InstanceNotFound_Remote(exception.InstanceNotFound): + + def __init__(self, message=None, **kwargs): + super().__init__(message=message, **kwargs) + + # Simulate a long exception message containing tracebacks because + # oslo.messaging appends them. + message = 'Instance was not found.\n'.ljust(255, '*') + mock_save.side_effect = InstanceNotFound_Remote(message=message, + instance_id=uuids.inst) + api = neutronapi.API() + ctxt = context.get_context() + instance = fake_instance.fake_instance_obj(ctxt, uuid=uuids.i) + + self.assertRaises( + exception.InstanceNotFound, + neutronapi.update_instance_cache_with_nw_info, api, ctxt, instance, + nw_info=model.NetworkInfo()) + + # Verify we didn't log exception at level ERROR. + mock_log_exc.assert_not_called() + # Verify exception message was truncated before logging it. + self.assertLessEqual(len(mock_log_info.call_args.args[1]), 255) + class TestAPIPortbinding(TestAPIBase): diff --git a/nova/tests/unit/objects/test_instance_info_cache.py b/nova/tests/unit/objects/test_instance_info_cache.py index 2df596f5af96..13c1082ffcc2 100644 --- a/nova/tests/unit/objects/test_instance_info_cache.py +++ b/nova/tests/unit/objects/test_instance_info_cache.py @@ -15,6 +15,7 @@ import datetime import mock +from oslo_db import exception as db_exc from oslo_utils.fixture import uuidsentinel as uuids from oslo_utils import timeutils @@ -82,6 +83,30 @@ class _TestInstanceInfoCacheObject(object): self.assertEqual(timeutils.normalize_time(fake_updated_at), timeutils.normalize_time(obj.updated_at)) + @mock.patch.object(db, 'instance_info_cache_update') + def test_save_fkey_constraint_fail(self, mock_update): + fake_updated_at = datetime.datetime(2015, 1, 1) + nwinfo = network_model.NetworkInfo.hydrate([{'address': 'foo'}]) + nwinfo_json = nwinfo.json() + new_info_cache = fake_info_cache.copy() + new_info_cache['id'] = 1 + new_info_cache['updated_at'] = fake_updated_at + new_info_cache['network_info'] = nwinfo_json + + # We should see InstanceNotFound raised for fkey=instance_uuid + mock_update.side_effect = db_exc.DBReferenceError( + 'table', 'constraint', 'instance_uuid', 'key_table') + + obj = instance_info_cache.InstanceInfoCache(context=self.context) + obj.instance_uuid = uuids.info_instance + obj.network_info = nwinfo_json + self.assertRaises(exception.InstanceNotFound, obj.save) + + # We should see the original exception raised for any other fkey + mock_update.side_effect = db_exc.DBReferenceError( + 'table', 'constraint', 'otherkey', 'key_table') + self.assertRaises(db_exc.DBReferenceError, obj.save) + @mock.patch.object(db, 'instance_info_cache_get', return_value=fake_info_cache) def test_refresh(self, mock_get):