Get instances from conductor in init_host.

Update compute's init_host() to get the list of instances on this host
via the conductor service.  This removes a db read from compute.

Most of the test fixes ensure that conductor is running anywhere that
compute is running.  The EC2 availability zones test change (from 13 to
15) is because that output includes information on each service that is
running, so starting nova-conductor in this test suite adds to it.

Note that this uses the previously-added ping() call in conductor to
determine when the service is available. The compute manager pings the
conductor with a ten-second timeout for up to ten attempts and then
falls back to the configured default RPC timeout. This should be a
reasonable compromise between requiring strict service startup ordering
and extreme delays.
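
The retry loop added below boils down to the following pattern (a
minimal, self-contained sketch; wait_for_service, ping, and timeout_exc
are illustrative names, not part of this change):

    def wait_for_service(ping, timeout_exc, short_timeout=10,
                         short_attempts=10):
        # Make a few attempts with a short timeout, then keep retrying
        # with timeout=None so the configured RPC default applies.
        attempt = 0
        while True:
            timeout = short_timeout if attempt < short_attempts else None
            attempt += 1
            try:
                ping(timeout=timeout)
                return
            except timeout_exc:
                # Swallow the timeout and retry; letting it propagate
                # during init_host() would abort service startup.
                pass

In the manager below, ping is conductor_api.ping(context,
'1.21 GigaWatts', timeout=timeout) and timeout_exc is
rpc_common.Timeout.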

Part of blueprint no-db-compute.

Change-Id: Ie2953f7ae79819a1b6e24e8997ed4332fd4d2356
Author: Russell Bryant, 2012-11-29 22:03:34 -05:00 (committed by Dan Smith)
Parent: e1c7b18c7f
Commit: 60965a50bc
8 changed files with 64 additions and 7 deletions


@@ -66,6 +66,7 @@ from nova.openstack.common import lockutils
from nova.openstack.common import log as logging
from nova.openstack.common.notifier import api as notifier
from nova.openstack.common import rpc
from nova.openstack.common.rpc import common as rpc_common
from nova.openstack.common import timeutils
from nova import quota
from nova.scheduler import rpcapi as scheduler_rpcapi
@@ -359,6 +360,32 @@ class ComputeManager(manager.SchedulerDependentManager):
                        'trying to set it to ERROR'),
                      instance_uuid=instance_uuid)

    def _get_instances_at_startup(self, context):
        '''Get instances for this host during service init.'''
        attempt = 0
        timeout = 10
        while True:
            # NOTE(danms): Try ten times with a short timeout, and then punt
            # to the configured RPC timeout after that
            if attempt == 10:
                timeout = None
            attempt += 1
            # NOTE(russellb): This is running during service startup. If we
            # allow an exception to be raised, the service will shut down.
            # This may fail the first time around if nova-conductor wasn't
            # running when nova-compute started.
            try:
                self.conductor_api.ping(context, '1.21 GigaWatts',
                                        timeout=timeout)
                break
            except rpc_common.Timeout as e:
                LOG.exception(_('Timed out waiting for nova-conductor. '
                                'Is it running? Or did nova-compute start '
                                'before nova-conductor?'))
        return self.conductor_api.instance_get_all_by_host(context, self.host)

    def _init_instance(self, context, instance):
        '''Initialize this instance during service init.'''
        db_state = instance['power_state']
@@ -417,10 +444,7 @@ class ComputeManager(manager.SchedulerDependentManager):
        """Initialization for a standalone compute service."""
        self.driver.init_host(host=self.host)
        context = nova.context.get_admin_context()
        # NOTE(danms): this requires some care since conductor
        # may not be up and fielding requests by the time compute is
        instances = self.db.instance_get_all_by_host(context, self.host)
        instances = self._get_instances_at_startup(context)
        if CONF.defer_iptables_apply:
            self.driver.filter_defer_apply_on()


@@ -124,6 +124,8 @@ class CinderCloudTestCase(test.TestCase):
        self.flags(use_local=True, group='conductor')
        # set up services
        self.conductor = self.start_service('conductor',
            manager=CONF.conductor.manager)
        self.compute = self.start_service('compute')
        self.scheduler = self.start_service('scheduler')
        self.network = self.start_service('network')


@@ -139,6 +139,8 @@ class CloudTestCase(test.TestCase):
        self.flags(use_local=True, group='conductor')
        # set up services
        self.conductor = self.start_service('conductor',
            manager=CONF.conductor.manager)
        self.compute = self.start_service('compute')
        self.scheduler = self.start_service('scheduler')
        self.network = self.start_service('network')
@@ -730,7 +732,7 @@ class CloudTestCase(test.TestCase):
        result = self.cloud.describe_availability_zones(admin_ctxt,
                                                        zone_name='verbose')
        self.assertEqual(len(result['availabilityZoneInfo']), 13)
        self.assertEqual(len(result['availabilityZoneInfo']), 15)
        db.service_destroy(self.context, service1['id'])
        db.service_destroy(self.context, service2['id'])


@@ -52,6 +52,8 @@ class EC2ValidateTestCase(test.TestCase):
        self.cloud = cloud.CloudController()
        # set up services
        self.conductor = self.start_service('conductor',
            manager=CONF.conductor.manager)
        self.compute = self.start_service('compute')
        self.scheduter = self.start_service('scheduler')
        self.network = self.start_service('network')


@@ -3095,6 +3095,21 @@ class ComputeTestCase(BaseTestCase):
        instance = self._create_fake_instance(params)
        self.compute._instance_update(self.context, instance['uuid'])

    def test_startup_conductor_ping(self):
        timeouts = []
        calls = dict(count=0)

        def fake_ping(context, message, timeout):
            timeouts.append(timeout)
            calls['count'] += 1
            if calls['count'] < 15:
                raise rpc_common.Timeout("fake")

        self.stubs.Set(self.compute.conductor_api, 'ping', fake_ping)
        self.compute._get_instances_at_startup(self.context)
        self.assertEqual(timeouts.count(10), 10)
        self.assertTrue(None in timeouts)
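
The fake ping above raises rpc_common.Timeout on the first fourteen
calls, so _get_instances_at_startup should make ten attempts with the
short ten-second timeout (timeouts.count(10) == 10) and then fall back
to timeout=None, the configured RPC default, until the fifteenth call
succeeds (hence None in timeouts).
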
class ComputeAPITestCase(BaseTestCase):


@@ -24,6 +24,7 @@ import string
import uuid
import nova.image.glance
from nova.openstack.common import cfg
from nova.openstack.common import log as logging
from nova import service
from nova import test # For the flags
@@ -32,6 +33,7 @@ import nova.tests.image.fake
from nova.tests.integrated.api import client

CONF = cfg.CONF
LOG = logging.getLogger(__name__)
@@ -73,12 +75,12 @@ class _IntegratedTestBase(test.TestCase):
                   'chance.ChanceScheduler')
        # set up services
        self.conductor = self.start_service('conductor',
            manager=CONF.conductor.manager)
        self.compute = self.start_service('compute')
        self.scheduler = self.start_service('cert')
        self.network = self.start_service('network')
        self.scheduler = self.start_service('scheduler')
        self.conductor = self.start_service(
            'conductor', manager='nova.conductor.manager.ConductorManager')
        self._start_api_service()


@@ -27,6 +27,7 @@ from nova import exception
from nova import ipv6
from nova.network import linux_net
from nova.network import manager as network_manager
from nova.openstack.common import cfg
from nova.openstack.common import importutils
from nova.openstack.common import log as logging
from nova.openstack.common import rpc
@@ -39,6 +40,7 @@ from nova.tests import matchers
from nova import utils

CONF = cfg.CONF
LOG = logging.getLogger(__name__)
@@ -1585,6 +1587,8 @@ class AllocateTestCase(test.TestCase):
    def test_allocate_for_instance(self):
        address = "10.10.10.10"
        self.flags(auto_assign_floating_ip=True)
        self.conductor = self.start_service(
            'conductor', manager=CONF.conductor.manager)
        self.compute = self.start_service('compute')
        self.network = self.start_service('network')


@@ -18,10 +18,15 @@
"""Tests for the testing base code."""

from nova.openstack.common import cfg
from nova.openstack.common import rpc
from nova import test

CONF = cfg.CONF
CONF.import_opt('use_local', 'nova.conductor.api', group='conductor')


class IsolationTestCase(test.TestCase):
    """Ensure that things are cleaned up after failed tests.
@@ -30,6 +35,7 @@ class IsolationTestCase(test.TestCase):
    """

    def test_service_isolation(self):
        self.flags(use_local=True, group='conductor')
        self.useFixture(test.ServiceFixture('compute'))

    def test_rpc_consumer_isolation(self):