Restore soft-deleted compute node with same uuid

There is a unique index on the compute_nodes.uuid column which means we can't have more than one compute_nodes record in the same DB with the same UUID even if one is soft deleted because the deleted column is not part of that unique index constraint. This is a problem with ironic nodes where the node is 1:1 with the compute node record, and when a node is undergoing maintenance the driver doesn't return it from get_available_nodes() so the ComputeManager.update_available_resource periodic task (soft) deletes the compute node record, but when the node is no longer under maintenance in ironic and the driver reports it, the ResourceTracker._init_compute_node code will fail to create the ComputeNode record again because of the duplicate uuid. This change handles the DBDuplicateEntry error in compute_node_create by finding the soft-deleted compute node with the same uuid and simply updating it to no longer be (soft) deleted. Closes-Bug: #1839560 Change-Id: Iafba419fe86446ffe636721f523fb619f8f787b3
2019-08-12 14:39:16 -04:00 · 2019-08-12 14:39:16 -04:00 · 8b007266f4
parent 89dd74ac7f
commit 8b007266f4
3 changed files with 75 additions and 25 deletions
--- a/nova/db/sqlalchemy/api.py
+++ b/nova/db/sqlalchemy/api.py
@ -30,6 +30,7 @@ from oslo_db.sqlalchemy import enginefacade
 from oslo_db.sqlalchemy import update_match
 from oslo_db.sqlalchemy import utils as sqlalchemyutils
 from oslo_log import log as logging
 from oslo_utils import excutils
 from oslo_utils import importutils
 from oslo_utils import timeutils
 from oslo_utils import uuidutils
@ -696,11 +697,54 @@ def compute_node_create(context, values):
    compute_node_ref = models.ComputeNode()
    compute_node_ref.update(values)
-    compute_node_ref.save(context.session)
+    try:
        compute_node_ref.save(context.session)
    except db_exc.DBDuplicateEntry:
        with excutils.save_and_reraise_exception(logger=LOG) as err_ctx:
            # Check to see if we have a (soft) deleted ComputeNode with the
            # same UUID and if so just update it and mark as no longer (soft)
            # deleted. See bug 1839560 for details.
            if 'uuid' in values:
                # Get a fresh context for a new DB session and allow it to
                # get a deleted record.
                ctxt = nova.context.get_admin_context(read_deleted='yes')
                compute_node_ref = _compute_node_get_and_update_deleted(
                    ctxt, values)
                # If we didn't get anything back we failed to find the node
                # by uuid and update it so re-raise the DBDuplicateEntry.
                if compute_node_ref:
                    err_ctx.reraise = False
    return compute_node_ref
@pick_context_manager_writer
 def _compute_node_get_and_update_deleted(context, values):
    """Find a ComputeNode by uuid, update and un-delete it.
    This is a special case from the ``compute_node_create`` method which
    needs to be separate to get a new Session.
    This method will update the ComputeNode, if found, to have deleted=0 and
    deleted_at=None values.
    :param context: request auth context which should be able to read deleted
        records
    :param values: values used to update the ComputeNode record - must include
        uuid
    :return: updated ComputeNode sqlalchemy model object if successfully found
        and updated, None otherwise
    """
    cn = model_query(
        context, models.ComputeNode).filter_by(uuid=values['uuid']).first()
    if cn:
        # Update with the provided values but un-soft-delete.
        update_values = copy.deepcopy(values)
        update_values['deleted'] = 0
        update_values['deleted_at'] = None
        return compute_node_update(context, cn.id, update_values)
@oslo_db_api.wrap_db_retry(max_retries=5, retry_on_deadlock=True)
@pick_context_manager_writer
 def compute_node_update(context, compute_id, values):
--- a/nova/tests/functional/regressions/test_bug_1839560.py
+++ b/nova/tests/functional/regressions/test_bug_1839560.py
@ -14,7 +14,6 @@ from oslo_log import log as logging
 from nova import context
 from nova.db import api as db_api
 from nova import exception
 from nova import objects
 from nova import test
 from nova.tests import fixtures as nova_fixtures
@ -93,30 +92,25 @@ class PeriodicNodeRecreateTestCase(test.TestCase,
        # Now stub the driver again to report node2 as being back and run
        # the periodic task.
        compute.manager.driver._set_nodes(['node1', 'node2'])
        LOG.info('Running update_available_resource which should bring back '
                 'node2.')
        compute.manager.update_available_resource(ctxt)
-        # FIXME(mriedem): This is bug 1839560 where the ResourceTracker fails
+        # The DBDuplicateEntry error should have been handled and resulted in
-        # to create a ComputeNode for node2 because of conflicting UUIDs.
+        # updating the (soft) deleted record to no longer be deleted.
        log = self.stdlog.logger.output
-        self.assertIn('Error updating resources for node node2', log)
+        self.assertNotIn('DBDuplicateEntry', log)
-        self.assertIn('DBDuplicateEntry', log)
+        # Should have two reported hypervisors again.
        # Should still only have one reported hypervisor (node1).
        hypervisors = self.api.api_get('/os-hypervisors').body['hypervisors']
        self.assertEqual(1, len(hypervisors), hypervisors)
        # Test the workaround for bug 1839560 by archiving the deleted node2
        # compute_nodes table record which will allow the periodic to create a
        # new entry for node2. We can remove this when the bug is fixed.
        LOG.info('Archiving the database.')
        archived = db_api.archive_deleted_rows(1000)[0]
        self.assertIn('compute_nodes', archived)
        self.assertEqual(1, archived['compute_nodes'])
        with utils.temporary_mutation(ctxt, read_deleted='yes'):
            self.assertRaises(exception.ComputeHostNotFound,
                              objects.ComputeNode.get_by_host_and_nodename,
                              ctxt, 'node1', 'node2')
        # Now run the periodic again and we should have a new ComputeNode for
        # node2.
        LOG.info('Running update_available_resource which should create a new '
                 'ComputeNode record for node2.')
        compute.manager.update_available_resource(ctxt)
        hypervisors = self.api.api_get('/os-hypervisors').body['hypervisors']
        self.assertEqual(2, len(hypervisors), hypervisors)
        # Now that the node2 record was un-soft-deleted, archiving should not
        # remove any compute_nodes.
        LOG.info('Archiving the database.')
        archived = db_api.archive_deleted_rows(1000)[0]
        self.assertNotIn('compute_nodes', archived)
        cn2 = objects.ComputeNode.get_by_host_and_nodename(
            ctxt, 'node1', 'node2')
        self.assertFalse(cn2.deleted)
        self.assertIsNone(cn2.deleted_at)
        # The node2 id and uuid should not have changed in the DB.
        self.assertEqual(cn.id, cn2.id)
        self.assertEqual(cn.uuid, cn2.uuid)
--- a/nova/tests/unit/db/test_db_api.py
+++ b/nova/tests/unit/db/test_db_api.py
@ -7215,6 +7215,18 @@ class ComputeNodeTestCase(test.TestCase, ModelsObjectComparatorMixin):
        new_stats = jsonutils.loads(self.item['stats'])
        self.assertEqual(self.stats, new_stats)
    def test_compute_node_create_duplicate_host_hypervisor_hostname(self):
        """Tests to make sure that DBDuplicateEntry is raised when trying to
        create a duplicate ComputeNode with the same host and
        hypervisor_hostname values but different uuid values. This makes
        sure that when _compute_node_get_and_update_deleted returns None
        the DBDuplicateEntry is re-raised.
        """
        other_node = dict(self.compute_node_dict)
        other_node['uuid'] = uuidutils.generate_uuid()
        self.assertRaises(db_exc.DBDuplicateEntry,
                          db.compute_node_create, self.ctxt, other_node)
    def test_compute_node_get_all(self):
        nodes = db.compute_node_get_all(self.ctxt)
        self.assertEqual(1, len(nodes))