
Add functional regression recreate test for bug 1839560

This adds a functional test which recreates bug 1839560, where the
driver reports a node, then no longer reports it so the compute
manager deletes its compute node record, and then the driver reports
it again later (this can be common with ironic nodes as they undergo
maintenance). The issue is that since
Ia69fabce8e7fd7de101e291fe133c6f5f5f7056a in Rocky, the ironic node
uuid is re-used as the compute node uuid, but there is a unique
constraint on the compute node uuid. When the ironic node becomes
available again, creating the compute node fails with a duplicate
entry error because the soft-deleted record still holds that uuid. To
recreate this in the functional test, a new fake virt driver is added
which provides a predictable uuid per node, like the ironic driver.
The test also shows that archiving the database is a way to work
around the bug until it is properly fixed.

Change-Id: If822509e906d5094f13a8700b2b9ed3c40580431
Related-Bug: #1839560
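
To make the failure mode concrete, here is a minimal standalone sketch
(plain sqlite3, not nova code; the table layout and values are
simplified stand-ins for nova's schema) of how a unique constraint that
still sees soft-deleted rows produces the duplicate entry error, and why
archiving the soft-deleted row clears it:

    import sqlite3

    conn = sqlite3.connect(':memory:')
    conn.execute('CREATE TABLE compute_nodes '
                 '(id INTEGER PRIMARY KEY, uuid TEXT UNIQUE, deleted INTEGER)')
    # The driver reports the node and a ComputeNode record is created.
    conn.execute("INSERT INTO compute_nodes (uuid, deleted) VALUES ('abc', 0)")
    # The node goes away, so the record is soft deleted (nova sets
    # deleted=id rather than removing the row).
    conn.execute("UPDATE compute_nodes SET deleted = id WHERE uuid = 'abc'")
    # The node comes back; creating a new record with the same uuid fails
    # because the unique constraint still sees the soft-deleted row.
    try:
        conn.execute(
            "INSERT INTO compute_nodes (uuid, deleted) VALUES ('abc', 0)")
    except sqlite3.IntegrityError as e:
        print(e)  # UNIQUE constraint failed: compute_nodes.uuid
    # Archiving removes the soft-deleted row, so the create succeeds again.
    conn.execute("DELETE FROM compute_nodes WHERE deleted != 0")
    conn.execute("INSERT INTO compute_nodes (uuid, deleted) VALUES ('abc', 0)")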
Tag: 20.0.0.0rc1
Author: Matt Riedemann, 2 months ago
Commit: 89dd74ac7f
2 changed files with 136 additions and 0 deletions
  1. nova/tests/functional/regressions/test_bug_1839560.py (+122, -0)
  2. nova/virt/fake.py (+14, -0)

nova/tests/functional/regressions/test_bug_1839560.py (+122, -0)

@@ -0,0 +1,122 @@
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+from oslo_log import log as logging
+
+from nova import context
+from nova.db import api as db_api
+from nova import exception
+from nova import objects
+from nova import test
+from nova.tests import fixtures as nova_fixtures
+from nova.tests.functional import fixtures as func_fixtures
+from nova.tests.functional import integrated_helpers
+from nova import utils
+
+LOG = logging.getLogger(__name__)
+
+
+class PeriodicNodeRecreateTestCase(test.TestCase,
+                                   integrated_helpers.InstanceHelperMixin):
+    """Regression test for bug 1839560 introduced in Rocky.
+
+    When an ironic node is undergoing maintenance the driver will not report
+    it as an available node to the ComputeManager.update_available_resource
+    periodic task. The ComputeManager will then (soft) delete a ComputeNode
+    record for that no-longer-available node. If/when the ironic node is
+    available again and the driver reports it, the ResourceTracker will attempt
+    to create a ComputeNode record for the ironic node.
+
+    The regression with change Ia69fabce8e7fd7de101e291fe133c6f5f5f7056a is
+    that the ironic node uuid is used as the ComputeNode.uuid and there is
+    a unique constraint on the ComputeNode.uuid value in the database. So
+    trying to create a ComputeNode with the same uuid (after the ironic node
+    comes back from being unavailable) fails with a DuplicateEntry error since
+    there is a (soft) deleted version of the ComputeNode with the same uuid
+    in the database.
+    """
+    def setUp(self):
+        super(PeriodicNodeRecreateTestCase, self).setUp()
+        # We need the PlacementFixture for the compute nodes to report in but
+        # otherwise don't care about placement for this test.
+        self.useFixture(func_fixtures.PlacementFixture())
+        # Start up the API so we can query the os-hypervisors API.
+        self.api = self.useFixture(nova_fixtures.OSAPIFixture(
+            api_version='v2.1')).admin_api
+        # Make sure we're using the fake driver that has predictable uuids
+        # for each node.
+        self.flags(compute_driver='fake.PredictableNodeUUIDDriver')
+
+    def test_update_available_resource_node_recreate(self):
+        # First we create a compute service to manage a couple of fake nodes.
+        compute = self.start_service('compute', 'node1')
+        # When start_service runs, it will create the node1 ComputeNode.
+        compute.manager.driver._set_nodes(['node1', 'node2'])
+        # Run the update_available_resource periodic to register node2.
+        ctxt = context.get_admin_context()
+        compute.manager.update_available_resource(ctxt)
+        # Make sure no compute nodes were orphaned or deleted.
+        self.assertNotIn('Deleting orphan compute node',
+                         self.stdlog.logger.output)
+        # Now we should have two compute nodes, make sure the hypervisors API
+        # shows them.
+        hypervisors = self.api.api_get('/os-hypervisors').body['hypervisors']
+        self.assertEqual(2, len(hypervisors), hypervisors)
+        self.assertEqual({'node1', 'node2'},
+                         set([hyp['hypervisor_hostname']
+                              for hyp in hypervisors]))
+        # Now stub the driver to only report node1. This is making it look like
+        # node2 is no longer available when update_available_resource runs.
+        compute.manager.driver._set_nodes(['node1'])
+        compute.manager.update_available_resource(ctxt)
+        # node2 should have been deleted, check the logs and API.
+        log = self.stdlog.logger.output
+        self.assertIn('Deleting orphan compute node', log)
+        self.assertIn('hypervisor host is node2', log)
+        hypervisors = self.api.api_get('/os-hypervisors').body['hypervisors']
+        self.assertEqual(1, len(hypervisors), hypervisors)
+        self.assertEqual('node1', hypervisors[0]['hypervisor_hostname'])
+        # But the node2 ComputeNode is still in the database with deleted!=0.
+        with utils.temporary_mutation(ctxt, read_deleted='yes'):
+            cn = objects.ComputeNode.get_by_host_and_nodename(
+                ctxt, 'node1', 'node2')
+            self.assertTrue(cn.deleted)
+        # Now stub the driver again to report node2 as being back and run
+        # the periodic task.
+        compute.manager.driver._set_nodes(['node1', 'node2'])
+        compute.manager.update_available_resource(ctxt)
+        # FIXME(mriedem): This is bug 1839560 where the ResourceTracker fails
+        # to create a ComputeNode for node2 because of conflicting UUIDs.
+        log = self.stdlog.logger.output
+        self.assertIn('Error updating resources for node node2', log)
+        self.assertIn('DBDuplicateEntry', log)
+        # Should still only have one reported hypervisor (node1).
+        hypervisors = self.api.api_get('/os-hypervisors').body['hypervisors']
+        self.assertEqual(1, len(hypervisors), hypervisors)
+        # Test the workaround for bug 1839560 by archiving the deleted node2
+        # compute_nodes table record which will allow the periodic to create a
+        # new entry for node2. We can remove this when the bug is fixed.
+        LOG.info('Archiving the database.')
+        archived = db_api.archive_deleted_rows(1000)[0]
+        self.assertIn('compute_nodes', archived)
+        self.assertEqual(1, archived['compute_nodes'])
+        with utils.temporary_mutation(ctxt, read_deleted='yes'):
+            self.assertRaises(exception.ComputeHostNotFound,
+                              objects.ComputeNode.get_by_host_and_nodename,
+                              ctxt, 'node1', 'node2')
+        # Now run the periodic again and we should have a new ComputeNode for
+        # node2.
+        LOG.info('Running update_available_resource which should create a new '
+                 'ComputeNode record for node2.')
+        compute.manager.update_available_resource(ctxt)
+        hypervisors = self.api.api_get('/os-hypervisors').body['hypervisors']
+        self.assertEqual(2, len(hypervisors), hypervisors)
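
A note on two helpers the test leans on. utils.temporary_mutation(ctxt,
read_deleted='yes') temporarily flips the request context so database
reads include soft-deleted rows; as a rough sketch (simplified, not the
actual nova implementation, which also copes with attributes that are
not yet set on the object):

    import contextlib

    @contextlib.contextmanager
    def temporary_mutation(obj, **kwargs):
        # Temporarily set attributes on obj, restoring originals on exit.
        old = {attr: getattr(obj, attr) for attr in kwargs}
        for attr, value in kwargs.items():
            setattr(obj, attr, value)
        try:
            yield
        finally:
            for attr, value in old.items():
                setattr(obj, attr, value)

The other helper, db_api.archive_deleted_rows, moves soft-deleted rows
out of the main tables into the shadow tables, which is what frees the
node2 uuid so the final periodic run can recreate the ComputeNode.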

nova/virt/fake.py (+14, -0)

@@ -26,6 +26,7 @@ semantics of real hypervisor connections.
 import collections
 import contextlib
 import time
+import uuid
 
 import fixtures
 import os_resource_classes as orc
@@ -726,6 +727,19 @@ class FakeFinishMigrationFailDriver(FakeDriver):
         raise exception.VirtualInterfaceCreateException()
 
 
+class PredictableNodeUUIDDriver(SmallFakeDriver):
+    """SmallFakeDriver variant that reports a predictable node uuid in
+    get_available_resource, like IronicDriver.
+    """
+    def get_available_resource(self, nodename):
+        resources = super(
+            PredictableNodeUUIDDriver, self).get_available_resource(nodename)
+        # This is used in ComputeNode.update_from_virt_driver which is called
+        # from the ResourceTracker when creating a ComputeNode.
+        resources['uuid'] = uuid.uuid5(uuid.NAMESPACE_DNS, nodename)
+        return resources
+
+
 class FakeRescheduleDriver(FakeDriver):
     """FakeDriver derivative that triggers a reschedule on the first spawn
     attempt. This is expected to only be used in tests that have more than
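
The driver is "predictable" because uuid.uuid5 generates name-based
uuids: the same namespace/name pair always hashes to the same value,
across processes and restarts, which mimics a stable ironic node uuid
between periodic task runs. For example:

    import uuid

    # uuid5 is deterministic for a given namespace and name, so each
    # fake nodename always maps to the same compute node uuid.
    first = uuid.uuid5(uuid.NAMESPACE_DNS, 'node2')
    second = uuid.uuid5(uuid.NAMESPACE_DNS, 'node2')
    assert first == second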
