Browse Source

Merge "Cleanup when hitting MaxRetriesExceeded from no host_available" into stable/rocky

tags/18.2.2
Zuul 1 month ago
parent
commit
b79689fcbc

+ 25
- 7
nova/conductor/manager.py View File

@@ -534,6 +534,23 @@ class ComputeTaskManager(base.Base):
534 534
                     bdm.attachment_id = attachment['id']
535 535
                     bdm.save()
536 536
 
537
+    def _cleanup_when_reschedule_fails(
538
+            self, context, instance, exception, request_spec,
539
+            requested_networks):
540
+        """Set the instance state and clean up.
541
+
542
+        It is only used in case build_instance fails while rescheduling the
543
+        instance
544
+        """
545
+
546
+        updates = {'vm_state': vm_states.ERROR,
547
+                   'task_state': None}
548
+        self._set_vm_state_and_notify(
549
+            context, instance.uuid, 'build_instances', updates, exception,
550
+            request_spec)
551
+        self._cleanup_allocated_networks(
552
+            context, instance, requested_networks)
553
+
537 554
     # NOTE(danms): This is never cell-targeted because it is only used for
538 555
     # cellsv1 (which does not target cells directly) and n-cpu reschedules
539 556
     # (which go to the cell conductor and thus are always cell-specific).
@@ -614,11 +631,7 @@ class ComputeTaskManager(base.Base):
614 631
             # disabled in those cases.
615 632
             num_attempts = filter_properties.get(
616 633
                 'retry', {}).get('num_attempts', 1)
617
-            updates = {'vm_state': vm_states.ERROR, 'task_state': None}
618 634
             for instance in instances:
619
-                self._set_vm_state_and_notify(
620
-                    context, instance.uuid, 'build_instances', updates,
621
-                    exc, request_spec)
622 635
                 # If num_attempts > 1, we're in a reschedule and probably
623 636
                 # either hit NoValidHost or MaxRetriesExceeded. Either way,
624 637
                 # the build request should already be gone and we probably
@@ -631,8 +644,9 @@ class ComputeTaskManager(base.Base):
631 644
                         self._destroy_build_request(context, instance)
632 645
                     except exception.BuildRequestNotFound:
633 646
                         pass
634
-                self._cleanup_allocated_networks(
635
-                    context, instance, requested_networks)
647
+                self._cleanup_when_reschedule_fails(
648
+                    context, instance, exc, request_spec,
649
+                    requested_networks)
636 650
             return
637 651
 
638 652
         elevated = context.elevated()
@@ -673,7 +687,11 @@ class ComputeTaskManager(base.Base):
673 687
                     msg = ("Exhausted all hosts available for retrying build "
674 688
                            "failures for instance %(instance_uuid)s." %
675 689
                            {"instance_uuid": instance.uuid})
676
-                    raise exception.MaxRetriesExceeded(reason=msg)
690
+                    exc = exception.MaxRetriesExceeded(reason=msg)
691
+                    self._cleanup_when_reschedule_fails(
692
+                        context, instance, exc, request_spec,
693
+                        requested_networks)
694
+                    return
677 695
             instance.availability_zone = (
678 696
                 availability_zones.get_host_availability_zone(context,
679 697
                         host.service_host))

+ 12
- 23
nova/tests/functional/regressions/test_bug_1837955.py View File

@@ -29,6 +29,11 @@ class BuildRescheduleClaimFailsTestCase(
29 29
     """
30 30
     compute_driver = 'fake.SmallFakeDriver'
31 31
 
32
+    def setUp(self):
33
+        super(BuildRescheduleClaimFailsTestCase, self).setUp()
34
+        fake_notifier.stub_notifier(self)
35
+        self.addCleanup(fake_notifier.reset)
36
+
32 37
     def _wait_for_unversioned_notification(self, event_type):
33 38
         for x in range(20):  # wait up to 10 seconds
34 39
             for notification in fake_notifier.NOTIFICATIONS:
@@ -85,31 +90,15 @@ class BuildRescheduleClaimFailsTestCase(
85 90
             image_uuid=fake_image.get_valid_image_id(),
86 91
             networks=[{'port': nova_fixtures.NeutronFixture.port_1['id']}])
87 92
         server = self.api.post_server({'server': server})
88
-        # FIXME(mriedem): This is bug 1837955 where the status is stuck in
89
-        # BUILD rather than the vm_state being set to error and the task_state
90
-        # being set to None. Uncomment this when the bug is fixed.
91
-        # server = self._wait_for_state_change(self.api, server, 'ERROR')
93
+        server = self._wait_for_state_change(self.api, server, 'ERROR')
92 94
 
93 95
         # Wait for the MaxRetriesExceeded fault to be recorded.
94 96
         # set_vm_state_and_notify sets the vm_state to ERROR before the fault
95 97
         # is recorded but after the notification is sent. So wait for the
96 98
         # unversioned notification to show up and then get the fault.
97
-        # FIXME(mriedem): Uncomment this when bug 1837955 is fixed.
98
-        # self._wait_for_unversioned_notification(
99
-        #     'compute_task.build_instances')
100
-        # server = self.api.get_server(server['id'])
101
-        # self.assertIn('fault', server)
102
-        # self.assertIn('Exceeded maximum number of retries',
103
-        #               server['fault']['message'])
104
-
105
-        # TODO(mriedem): Remove this when the bug is fixed. We need to assert
106
-        # something before the bug is fixed to show the failure so check the
107
-        # logs.
108
-        for x in range(20):
109
-            logs = self.stdlog.logger.output
110
-            if 'MaxRetriesExceeded' in logs:
111
-                break
112
-            time.sleep(.5)
113
-        else:
114
-            self.fail('Timed out waiting for MaxRetriesExceeded to show up '
115
-                      'in the logs.')
99
+        self._wait_for_unversioned_notification(
100
+            'compute_task.build_instances')
101
+        server = self.api.get_server(server['id'])
102
+        self.assertIn('fault', server)
103
+        self.assertIn('Exceeded maximum number of retries',
104
+                      server['fault']['message'])

+ 23
- 13
nova/tests/unit/conductor/test_conductor.py View File

@@ -702,28 +702,38 @@ class _BaseTaskTestCase(object):
702 702
             mock.call(self.context, instances[1].uuid)])
703 703
         self.assertFalse(mock_get_by_host.called)
704 704
 
705
-    @mock.patch("nova.scheduler.utils.claim_resources", return_value=False)
705
+    @mock.patch('nova.conductor.manager.ComputeTaskManager.'
706
+                '_set_vm_state_and_notify')
706 707
     @mock.patch.object(objects.Instance, 'save')
707
-    def test_build_instances_exhaust_host_list(self, _mock_save, mock_claim):
708
+    def test_build_instances_exhaust_host_list(self, _mock_save, mock_notify):
708 709
         # A list of three alternate hosts for one instance
709 710
         host_lists = copy.deepcopy(fake_host_lists_alt)
710 711
         instance = fake_instance.fake_instance_obj(self.context)
711 712
         image = {'fake-data': 'should_pass_silently'}
712
-        expected_claim_count = len(host_lists[0])
713 713
 
714 714
         # build_instances() is a cast, we need to wait for it to complete
715 715
         self.useFixture(cast_as_call.CastAsCall(self))
716
+
717
+        self.conductor.build_instances(
718
+            context=self.context,
719
+            instances=[instance], image=image,
720
+            filter_properties={},
721
+            admin_password='admin_password',
722
+            injected_files='injected_files',
723
+            requested_networks=None,
724
+            security_groups='security_groups',
725
+            block_device_mapping=None,
726
+            legacy_bdm=None,
727
+            host_lists=host_lists
728
+        )
729
+
716 730
         # Since claim_resources() is mocked to always return False, we will run
717
-        # out of alternate hosts, and MaxRetriesExceeded should be raised.
718
-        self.assertRaises(exc.MaxRetriesExceeded,
719
-                self.conductor.build_instances, context=self.context,
720
-                instances=[instance], image=image, filter_properties={},
721
-                admin_password='admin_password',
722
-                injected_files='injected_files', requested_networks=None,
723
-                security_groups='security_groups',
724
-                block_device_mapping=None, legacy_bdm=None,
725
-                host_lists=host_lists)
726
-        self.assertEqual(expected_claim_count, mock_claim.call_count)
731
+        # out of alternate hosts, and complain about MaxRetriesExceeded.
732
+        mock_notify.assert_called_once_with(
733
+            self.context, instance.uuid, 'build_instances',
734
+            test.MatchType(dict),  # updates
735
+            test.MatchType(exc.MaxRetriesExceeded),
736
+            test.MatchType(dict))  # request_spec
727 737
 
728 738
     @mock.patch.object(conductor_manager.ComputeTaskManager,
729 739
             '_destroy_build_request')

Loading…
Cancel
Save