diff --git a/nova/compute/api.py b/nova/compute/api.py index ccf428813a34..b1efbb2215d5 100644 --- a/nova/compute/api.py +++ b/nova/compute/api.py @@ -106,6 +106,7 @@ AGGREGATE_ACTION_ADD = 'Add' MIN_COMPUTE_TRUSTED_CERTS = 31 MIN_COMPUTE_ABORT_QUEUED_LIVE_MIGRATION = 34 MIN_COMPUTE_VOLUME_TYPE = 36 +MIN_COMPUTE_SYNC_COMPUTE_STATUS_DISABLED = 38 # FIXME(danms): Keep a global cache of the cells we find the # first time we look. This needs to be refreshed on a timer or @@ -5047,17 +5048,69 @@ class HostAPI(base.Base): """Get service entry for the given compute hostname.""" return objects.Service.get_by_compute_host(context, host_name) + def _update_compute_provider_status(self, context, service): + """Calls the compute service to sync the COMPUTE_STATUS_DISABLED trait. + + There are two cases where the API will not call the compute service: + + * The compute service is down. In this case the trait is synchronized + when the compute service is restarted. + * The compute service is old. In this case the trait is synchronized + when the compute service is upgraded and restarted. + + :param context: nova auth RequestContext + :param service: nova.objects.Service object which has been enabled + or disabled (see ``service_update``). + """ + # Make sure the service is up so we can make the RPC call. + if not self.servicegroup_api.service_is_up(service): + LOG.info('Compute service on host %s is down. The ' + 'COMPUTE_STATUS_DISABLED trait will be synchronized ' + 'when the service is restarted.', service.host) + return + + # Make sure the compute service is new enough for the trait sync + # behavior. + # TODO(mriedem): Remove this compat check in the U release. + if service.version < MIN_COMPUTE_SYNC_COMPUTE_STATUS_DISABLED: + LOG.info('Compute service on host %s is too old to sync the ' + 'COMPUTE_STATUS_DISABLED trait in Placement. The ' + 'trait will be synchronized when the service is ' + 'upgraded and restarted.', service.host) + return + + enabled = not service.disabled + # Avoid leaking errors out of the API. + try: + LOG.debug('Calling the compute service on host %s to sync the ' + 'COMPUTE_STATUS_DISABLED trait.', service.host) + self.rpcapi.set_host_enabled(context, service.host, enabled) + except Exception: + LOG.exception('An error occurred while updating host enabled ' + 'status to "%s" for compute host: %s', + 'enabled' if enabled else 'disabled', + service.host) + def service_update(self, context, service): """Performs the actual service update operation. + If the "disabled" field is changed, potentially calls the compute + service to sync the COMPUTE_STATUS_DISABLED trait on the compute node + resource providers managed by this compute service. + :param context: nova auth RequestContext :param service: nova.objects.Service object with changes already set on the object """ + # Before persisting changes and resetting the changed fields on the + # Service object, determine if the disabled field changed. + update_placement = 'disabled' in service.obj_what_changed() + # Persist the Service object changes to the database. service.save() - # TODO(mriedem): Reflect COMPUTE_STATUS_DISABLED trait changes to the - # associated compute node resource providers if the service's disabled - # status changed. + # If the disabled field changed, potentially call the compute service + # to sync the COMPUTE_STATUS_DISABLED trait. + if update_placement: + self._update_compute_provider_status(context, service) return service @target_host_cell diff --git a/nova/compute/rpcapi.py b/nova/compute/rpcapi.py index 3be891d21d0d..2106ff545890 100644 --- a/nova/compute/rpcapi.py +++ b/nova/compute/rpcapi.py @@ -913,7 +913,9 @@ class ComputeAPI(object): def set_host_enabled(self, ctxt, host, enabled): version = '5.0' cctxt = self.router.client(ctxt).prepare( - server=host, version=version) + server=host, version=version, + call_monitor_timeout=CONF.rpc_response_timeout, + timeout=CONF.long_rpc_timeout) return cctxt.call(ctxt, 'set_host_enabled', enabled=enabled) def swap_volume(self, ctxt, instance, old_volume_id, new_volume_id, diff --git a/nova/conf/rpc.py b/nova/conf/rpc.py index e5bd1b989861..8dae38aad2b4 100644 --- a/nova/conf/rpc.py +++ b/nova/conf/rpc.py @@ -28,6 +28,7 @@ Operations with RPC calls that utilize this value: * live migration * scheduling +* enabling/disabling a compute service Related options: diff --git a/nova/tests/functional/api_sample_tests/test_services.py b/nova/tests/functional/api_sample_tests/test_services.py index 68fc2c4e1243..7f6377d8bb57 100644 --- a/nova/tests/functional/api_sample_tests/test_services.py +++ b/nova/tests/functional/api_sample_tests/test_services.py @@ -37,6 +37,12 @@ class ServicesJsonTest(api_sample_base.ApiSampleTestBaseV21): test_services.fake_service_get_by_host_binary) self.stub_out("nova.db.api.service_update", test_services.fake_service_update) + # If we are not using real services, we need to stub out + # HostAPI._update_compute_provider_status so we don't actually + # try to call a fake service over RPC. + self.stub_out('nova.compute.api.HostAPI.' + '_update_compute_provider_status', + lambda *args, **kwargs: None) self.useFixture(utils_fixture.TimeFixture(test_services.fake_utcnow())) def test_services_list(self): diff --git a/nova/tests/functional/wsgi/test_services.py b/nova/tests/functional/wsgi/test_services.py index 3afe81d8c30a..607100ad2b0f 100644 --- a/nova/tests/functional/wsgi/test_services.py +++ b/nova/tests/functional/wsgi/test_services.py @@ -11,6 +11,7 @@ # under the License. import os_resource_classes as orc +import os_traits import six from nova import context as nova_context @@ -18,6 +19,8 @@ from nova import exception from nova import objects from nova.tests.functional.api import client as api_client from nova.tests.functional import integrated_helpers +from nova.tests.unit.image import fake as fake_image +from nova import utils class TestServicesAPI(integrated_helpers.ProviderUsageBaseTestCase): @@ -171,3 +174,132 @@ class TestServicesAPI(integrated_helpers.ProviderUsageBaseTestCase): log_output = self.stdlog.logger.output self.assertIn('Error updating resources for node host1.', log_output) self.assertIn('Failed to create resource provider host1', log_output) + + +class ComputeStatusFilterTest(integrated_helpers.ProviderUsageBaseTestCase): + """Tests the API, compute service and Placement interaction with the + COMPUTE_STATUS_DISABLED trait when a compute service is enable/disabled. + + This version of the test uses the 2.latest microversion for testing the + 2.53+ behavior of the PUT /os-services/{service_id} API. + """ + compute_driver = 'fake.SmallFakeDriver' + + def _update_service(self, service, disabled, forced_down=None): + """Update the service using the 2.53 request schema. + + :param service: dict representing the service resource in the API + :param disabled: True if the service should be disabled, False if the + service should be enabled + :param forced_down: Optionally change the forced_down value. + """ + status = 'disabled' if disabled else 'enabled' + req = {'status': status} + if forced_down is not None: + req['forced_down'] = forced_down + self.admin_api.put_service(service['id'], req) + + def test_compute_status_filter(self): + """Tests the compute_status_filter placement request filter""" + # Start a compute service so a compute node and resource provider is + # created. + compute = self._start_compute('host1') + # Get the UUID of the resource provider that was created. + rp_uuid = self._get_provider_uuid_by_host('host1') + # Get the service from the compute API. + services = self.admin_api.get_services(binary='nova-compute', + host='host1') + self.assertEqual(1, len(services)) + service = services[0] + + # At this point, the service should be enabled and the + # COMPUTE_STATUS_DISABLED trait should not be set on the + # resource provider in placement. + self.assertEqual('enabled', service['status']) + rp_traits = self._get_provider_traits(rp_uuid) + trait = os_traits.COMPUTE_STATUS_DISABLED + self.assertNotIn(trait, rp_traits) + + # Now disable the compute service via the API. + self._update_service(service, disabled=True) + + # The update to placement should be synchronous so check the provider + # traits and COMPUTE_STATUS_DISABLED should be set. + rp_traits = self._get_provider_traits(rp_uuid) + self.assertIn(trait, rp_traits) + + # Try creating a server which should fail because nothing is available. + networks = [{'port': self.neutron.port_1['id']}] + server_req = self._build_minimal_create_server_request( + self.api, 'test_compute_status_filter', + image_uuid=fake_image.get_valid_image_id(), networks=networks) + server = self.api.post_server({'server': server_req}) + server = self._wait_for_state_change(self.api, server, 'ERROR') + # There should be a NoValidHost fault recorded. + self.assertIn('fault', server) + self.assertIn('No valid host', server['fault']['message']) + + # Now enable the service and the trait should be gone. + self._update_service(service, disabled=False) + rp_traits = self._get_provider_traits(rp_uuid) + self.assertNotIn(trait, rp_traits) + + # Try creating another server and it should be OK. + server = self.api.post_server({'server': server_req}) + self._wait_for_state_change(self.api, server, 'ACTIVE') + + # Stop, force-down and disable the service so the API cannot call + # the compute service to sync the trait. + compute.stop() + self._update_service(service, disabled=True, forced_down=True) + # The API should have logged a message about the service being down. + self.assertIn('Compute service on host host1 is down. The ' + 'COMPUTE_STATUS_DISABLED trait will be synchronized ' + 'when the service is restarted.', + self.stdlog.logger.output) + # The trait should not be on the provider even though the node is + # disabled. + rp_traits = self._get_provider_traits(rp_uuid) + self.assertNotIn(trait, rp_traits) + # Restart the compute service which should sync and set the trait on + # the provider in placement. + self.restart_compute_service(compute) + rp_traits = self._get_provider_traits(rp_uuid) + self.assertIn(trait, rp_traits) + + +class ComputeStatusFilterTest211(ComputeStatusFilterTest): + """Extends ComputeStatusFilterTest and uses the 2.11 API for the + legacy os-services disable/enable/force-down API behavior + """ + microversion = '2.11' + + def _update_service(self, service, disabled, forced_down=None): + """Update the service using the 2.11 request schema. + + :param service: dict representing the service resource in the API + :param disabled: True if the service should be disabled, False if the + service should be enabled + :param forced_down: Optionally change the forced_down value. + """ + # Before 2.53 the service is uniquely identified by host and binary. + body = { + 'host': service['host'], + 'binary': service['binary'] + } + # Handle forced_down first if provided since the enable/disable + # behavior in the API depends on it. + if forced_down is not None: + body['forced_down'] = forced_down + self.admin_api.api_put('/os-services/force-down', body) + + if disabled: + self.admin_api.api_put('/os-services/disable', body) + else: + self.admin_api.api_put('/os-services/enable', body) + + def _get_provider_uuid_by_host(self, host): + # We have to temporarily mutate to 2.53 to get the hypervisor UUID. + with utils.temporary_mutation(self.admin_api, microversion='2.53'): + return super(ComputeStatusFilterTest211, + self)._get_provider_uuid_by_host(host) diff --git a/nova/tests/unit/compute/test_host_api.py b/nova/tests/unit/compute/test_host_api.py index f53d438ba5aa..26c7d1d1491a 100644 --- a/nova/tests/unit/compute/test_host_api.py +++ b/nova/tests/unit/compute/test_host_api.py @@ -16,6 +16,7 @@ import fixtures import mock +import oslo_messaging as messaging from oslo_utils.fixture import uuidsentinel as uuids from nova.api.openstack.compute import services @@ -327,18 +328,73 @@ class ComputeHostAPITestCase(test.TestCase): service_id = 42 expected_result = dict(test_service.fake_service, id=service_id) + @mock.patch.object(self.host_api, '_update_compute_provider_status') @mock.patch.object(self.host_api.db, 'service_get_by_host_and_binary') @mock.patch.object(self.host_api.db, 'service_update') - def _do_test(mock_service_update, mock_service_get_by_host_and_binary): + def _do_test(mock_service_update, mock_service_get_by_host_and_binary, + mock_update_compute_provider_status): mock_service_get_by_host_and_binary.return_value = expected_result mock_service_update.return_value = expected_result result = self.host_api.service_update_by_host_and_binary( self.ctxt, host_name, binary, params_to_update) self._compare_obj(result, expected_result) + mock_update_compute_provider_status.assert_called_once_with( + self.ctxt, test.MatchType(objects.Service)) _do_test() + @mock.patch('nova.compute.api.HostAPI._update_compute_provider_status', + new_callable=mock.NonCallableMock) + def test_service_update_no_update_provider_status(self, mock_ucps): + """Tests the scenario that the service is updated but the disabled + field is not changed, for example the forced_down field is only + updated. In this case _update_compute_provider_status should not be + called. + """ + service = objects.Service(forced_down=True) + self.assertIn('forced_down', service.obj_what_changed()) + with mock.patch.object(service, 'save') as mock_save: + retval = self.host_api.service_update(self.ctxt, service) + self.assertIs(retval, service) + mock_save.assert_called_once_with() + + @mock.patch('nova.compute.rpcapi.ComputeAPI.set_host_enabled', + new_callable=mock.NonCallableMock) + def test_update_compute_provider_status_service_too_old(self, mock_she): + """Tests the scenario that the service is up but is too old to sync the + COMPUTE_STATUS_DISABLED trait. + """ + service = objects.Service(host='fake-host') + service.version = compute.MIN_COMPUTE_SYNC_COMPUTE_STATUS_DISABLED - 1 + with mock.patch.object( + self.host_api.servicegroup_api, 'service_is_up', + return_value=True) as service_is_up: + self.host_api._update_compute_provider_status(self.ctxt, service) + service_is_up.assert_called_once_with(service) + self.assertIn('Compute service on host fake-host is too old to sync ' + 'the COMPUTE_STATUS_DISABLED trait in Placement.', + self.stdlog.logger.output) + + @mock.patch('nova.compute.rpcapi.ComputeAPI.set_host_enabled', + side_effect=messaging.MessagingTimeout) + def test_update_compute_provider_status_service_rpc_error(self, mock_she): + """Tests the scenario that the RPC call to the compute service raised + some exception. + """ + service = objects.Service(host='fake-host', disabled=True) + with mock.patch.object( + self.host_api.servicegroup_api, 'service_is_up', + return_value=True) as service_is_up: + self.host_api._update_compute_provider_status(self.ctxt, service) + service_is_up.assert_called_once_with(service) + mock_she.assert_called_once_with(self.ctxt, 'fake-host', False) + log_output = self.stdlog.logger.output + self.assertIn('An error occurred while updating host enabled ' + 'status to "disabled" for compute host: fake-host', + log_output) + self.assertIn('MessagingTimeout', log_output) + @mock.patch.object(objects.InstanceList, 'get_by_host', return_value = ['fake-responses']) def test_instance_get_all_by_host(self, mock_get): diff --git a/nova/tests/unit/compute/test_rpcapi.py b/nova/tests/unit/compute/test_rpcapi.py index 5dd86a7b8afb..68da4d20fb31 100644 --- a/nova/tests/unit/compute/test_rpcapi.py +++ b/nova/tests/unit/compute/test_rpcapi.py @@ -468,8 +468,10 @@ class ComputeRpcAPITestCase(test.NoDBTestCase): version='5.0') def test_set_host_enabled(self): + self.flags(long_rpc_timeout=600, rpc_response_timeout=120) self._test_compute_api('set_host_enabled', 'call', - enabled='enabled', host='host') + enabled='enabled', host='host', + call_monitor_timeout=120, timeout=600) def test_get_host_uptime(self): self._test_compute_api('get_host_uptime', 'call', host='host')