From 1b021665281b74c865d3571fc90772b52d70e467 Mon Sep 17 00:00:00 2001 From: Matt Riedemann Date: Mon, 12 Aug 2019 14:39:16 -0400 Subject: [PATCH] Restore soft-deleted compute node with same uuid There is a unique index on the compute_nodes.uuid column which means we can't have more than one compute_nodes record in the same DB with the same UUID even if one is soft deleted because the deleted column is not part of that unique index constraint. This is a problem with ironic nodes where the node is 1:1 with the compute node record, and when a node is undergoing maintenance the driver doesn't return it from get_available_nodes() so the ComputeManager.update_available_resource periodic task (soft) deletes the compute node record, but when the node is no longer under maintenance in ironic and the driver reports it, the ResourceTracker._init_compute_node code will fail to create the ComputeNode record again because of the duplicate uuid. This change handles the DBDuplicateEntry error in compute_node_create by finding the soft-deleted compute node with the same uuid and simply updating it to no longer be (soft) deleted. Closes-Bug: #1839560 Change-Id: Iafba419fe86446ffe636721f523fb619f8f787b3 (cherry picked from commit 8b007266f438ec0a5a797d05731cce6f2b155f4c) --- nova/db/sqlalchemy/api.py | 46 ++++++++++++++++++- .../regressions/test_bug_1839560.py | 42 ++++++++--------- nova/tests/unit/db/test_db_api.py | 12 +++++ 3 files changed, 75 insertions(+), 25 deletions(-) diff --git a/nova/db/sqlalchemy/api.py b/nova/db/sqlalchemy/api.py index 44bc6974a49b..79bd2c03f5ac 100644 --- a/nova/db/sqlalchemy/api.py +++ b/nova/db/sqlalchemy/api.py @@ -30,6 +30,7 @@ from oslo_db.sqlalchemy import enginefacade from oslo_db.sqlalchemy import update_match from oslo_db.sqlalchemy import utils as sqlalchemyutils from oslo_log import log as logging +from oslo_utils import excutils from oslo_utils import importutils from oslo_utils import timeutils from oslo_utils import uuidutils @@ -700,11 +701,54 @@ def compute_node_create(context, values): compute_node_ref = models.ComputeNode() compute_node_ref.update(values) - compute_node_ref.save(context.session) + try: + compute_node_ref.save(context.session) + except db_exc.DBDuplicateEntry: + with excutils.save_and_reraise_exception(logger=LOG) as err_ctx: + # Check to see if we have a (soft) deleted ComputeNode with the + # same UUID and if so just update it and mark as no longer (soft) + # deleted. See bug 1839560 for details. + if 'uuid' in values: + # Get a fresh context for a new DB session and allow it to + # get a deleted record. + ctxt = nova.context.get_admin_context(read_deleted='yes') + compute_node_ref = _compute_node_get_and_update_deleted( + ctxt, values) + # If we didn't get anything back we failed to find the node + # by uuid and update it so re-raise the DBDuplicateEntry. + if compute_node_ref: + err_ctx.reraise = False return compute_node_ref +@pick_context_manager_writer +def _compute_node_get_and_update_deleted(context, values): + """Find a ComputeNode by uuid, update and un-delete it. + + This is a special case from the ``compute_node_create`` method which + needs to be separate to get a new Session. + + This method will update the ComputeNode, if found, to have deleted=0 and + deleted_at=None values. + + :param context: request auth context which should be able to read deleted + records + :param values: values used to update the ComputeNode record - must include + uuid + :return: updated ComputeNode sqlalchemy model object if successfully found + and updated, None otherwise + """ + cn = model_query( + context, models.ComputeNode).filter_by(uuid=values['uuid']).first() + if cn: + # Update with the provided values but un-soft-delete. + update_values = copy.deepcopy(values) + update_values['deleted'] = 0 + update_values['deleted_at'] = None + return compute_node_update(context, cn.id, update_values) + + @oslo_db_api.wrap_db_retry(max_retries=5, retry_on_deadlock=True) @pick_context_manager_writer def compute_node_update(context, compute_id, values): diff --git a/nova/tests/functional/regressions/test_bug_1839560.py b/nova/tests/functional/regressions/test_bug_1839560.py index 9474d1bd37a3..94e507e1ba8d 100644 --- a/nova/tests/functional/regressions/test_bug_1839560.py +++ b/nova/tests/functional/regressions/test_bug_1839560.py @@ -14,7 +14,6 @@ from oslo_log import log as logging from nova import context from nova.db import api as db_api -from nova import exception from nova import objects from nova import test from nova.tests import fixtures as nova_fixtures @@ -91,30 +90,25 @@ class PeriodicNodeRecreateTestCase(test.TestCase, # Now stub the driver again to report node2 as being back and run # the periodic task. compute.manager.driver._nodes = ['node1', 'node2'] + LOG.info('Running update_available_resource which should bring back ' + 'node2.') compute.manager.update_available_resource(ctxt) - # FIXME(mriedem): This is bug 1839560 where the ResourceTracker fails - # to create a ComputeNode for node2 because of conflicting UUIDs. + # The DBDuplicateEntry error should have been handled and resulted in + # updating the (soft) deleted record to no longer be deleted. log = self.stdlog.logger.output - self.assertIn('Error updating resources for node node2', log) - self.assertIn('DBDuplicateEntry', log) - # Should still only have one reported hypervisor (node1). - hypervisors = self.api.api_get('/os-hypervisors').body['hypervisors'] - self.assertEqual(1, len(hypervisors), hypervisors) - # Test the workaround for bug 1839560 by archiving the deleted node2 - # compute_nodes table record which will allow the periodic to create a - # new entry for node2. We can remove this when the bug is fixed. - LOG.info('Archiving the database.') - archived = db_api.archive_deleted_rows(1000)[0] - self.assertIn('compute_nodes', archived) - self.assertEqual(1, archived['compute_nodes']) - with utils.temporary_mutation(ctxt, read_deleted='yes'): - self.assertRaises(exception.ComputeHostNotFound, - objects.ComputeNode.get_by_host_and_nodename, - ctxt, 'node1', 'node2') - # Now run the periodic again and we should have a new ComputeNode for - # node2. - LOG.info('Running update_available_resource which should create a new ' - 'ComputeNode record for node2.') - compute.manager.update_available_resource(ctxt) + self.assertNotIn('DBDuplicateEntry', log) + # Should have two reported hypervisors again. hypervisors = self.api.api_get('/os-hypervisors').body['hypervisors'] self.assertEqual(2, len(hypervisors), hypervisors) + # Now that the node2 record was un-soft-deleted, archiving should not + # remove any compute_nodes. + LOG.info('Archiving the database.') + archived = db_api.archive_deleted_rows(1000)[0] + self.assertNotIn('compute_nodes', archived) + cn2 = objects.ComputeNode.get_by_host_and_nodename( + ctxt, 'node1', 'node2') + self.assertFalse(cn2.deleted) + self.assertIsNone(cn2.deleted_at) + # The node2 id and uuid should not have changed in the DB. + self.assertEqual(cn.id, cn2.id) + self.assertEqual(cn.uuid, cn2.uuid) diff --git a/nova/tests/unit/db/test_db_api.py b/nova/tests/unit/db/test_db_api.py index 1172212734e0..52014760822e 100644 --- a/nova/tests/unit/db/test_db_api.py +++ b/nova/tests/unit/db/test_db_api.py @@ -7014,6 +7014,18 @@ class ComputeNodeTestCase(test.TestCase, ModelsObjectComparatorMixin): new_stats = jsonutils.loads(self.item['stats']) self.assertEqual(self.stats, new_stats) + def test_compute_node_create_duplicate_host_hypervisor_hostname(self): + """Tests to make sure that DBDuplicateEntry is raised when trying to + create a duplicate ComputeNode with the same host and + hypervisor_hostname values but different uuid values. This makes + sure that when _compute_node_get_and_update_deleted returns None + the DBDuplicateEntry is re-raised. + """ + other_node = dict(self.compute_node_dict) + other_node['uuid'] = uuidutils.generate_uuid() + self.assertRaises(db_exc.DBDuplicateEntry, + db.compute_node_create, self.ctxt, other_node) + def test_compute_node_get_all(self): nodes = db.compute_node_get_all(self.ctxt) self.assertEqual(1, len(nodes))