Handle ServiceNotFound in DbDriver._report_state

If a service record is gone from the DB, the _report_state method
logs a generic traceback every time the report interval elapses,
which is every 10 seconds by default. This gets pretty noisy, and
the logged error isn't very useful. One can get into this state
by deleting the service record via the API before stopping the actual
process that is running.

This change simply handles the ServiceNotFound error explicitly and logs
a more useful error message without the noisy traceback.

Change-Id: If0336001fbe39922a199756db0803121cbe560af
Related-Bug: #1813147
This commit is contained in:
Matt Riedemann 2019-12-04 09:50:17 -05:00
parent 1c2b7d8f01
commit 945d662d4f
2 changed files with 21 additions and 1 deletions

View File

@ -19,6 +19,7 @@ from oslo_utils import timeutils
import six import six
import nova.conf import nova.conf
from nova import exception
from nova.i18n import _, _LI, _LW, _LE from nova.i18n import _, _LI, _LW, _LE
from nova.servicegroup import api from nova.servicegroup import api
from nova.servicegroup.drivers import base from nova.servicegroup.drivers import base
@ -103,6 +104,16 @@ class DbDriver(base.Driver):
service.model_disconnected = True service.model_disconnected = True
LOG.warning(_LW('Lost connection to nova-conductor ' LOG.warning(_LW('Lost connection to nova-conductor '
'for reporting service status.')) 'for reporting service status.'))
except exception.ServiceNotFound:
# The service may have been deleted via the API but the actual
# process is still running. Provide a useful error message rather
# than the noisy traceback in the generic Exception block below.
LOG.error('The services table record for the %s service on '
'host %s is gone. You either need to stop this service '
'if it should be deleted or restart it to recreate the '
'record in the database.',
service.service_ref.binary, service.service_ref.host)
service.model_disconnected = True
except Exception: except Exception:
# NOTE(rpodolyaka): we'd like to avoid catching of all possible # NOTE(rpodolyaka): we'd like to avoid catching of all possible
# exceptions here, but otherwise it would become possible for # exceptions here, but otherwise it would become possible for

View File

@ -17,6 +17,7 @@ import oslo_messaging as messaging
from oslo_utils import fixture as utils_fixture from oslo_utils import fixture as utils_fixture
from oslo_utils import timeutils from oslo_utils import timeutils
from nova import exception
from nova import objects from nova import objects
from nova import servicegroup from nova import servicegroup
from nova import test from nova import test
@ -95,12 +96,13 @@ class DBServiceGroupTestCase(test.NoDBTestCase):
def _test_report_state_error(self, exc_cls, upd_mock): def _test_report_state_error(self, exc_cls, upd_mock):
upd_mock.side_effect = exc_cls("service save failed") upd_mock.side_effect = exc_cls("service save failed")
service_ref = objects.Service(host='fake-host', topic='compute', service_ref = objects.Service(host='fake-host', topic='compute',
report_count=10) report_count=10, binary='nova-compute')
service = mock.MagicMock(model_disconnected=False, service = mock.MagicMock(model_disconnected=False,
service_ref=service_ref) service_ref=service_ref)
fn = self.servicegroup_api._driver._report_state fn = self.servicegroup_api._driver._report_state
fn(service) # fail if exception not caught fn(service) # fail if exception not caught
self.assertTrue(service.model_disconnected) self.assertTrue(service.model_disconnected)
return service_ref
def test_report_state_error_handling_timeout(self): def test_report_state_error_handling_timeout(self):
self._test_report_state_error(messaging.MessagingTimeout) self._test_report_state_error(messaging.MessagingTimeout)
@ -108,6 +110,13 @@ class DBServiceGroupTestCase(test.NoDBTestCase):
def test_report_state_unexpected_error(self): def test_report_state_unexpected_error(self):
self._test_report_state_error(RuntimeError) self._test_report_state_error(RuntimeError)
def test_report_state_service_not_found(self):
service_ref = self._test_report_state_error(exception.ServiceNotFound)
self.assertIn('The services table record for the %s service on '
'host %s is gone.' %
(service_ref.binary, service_ref.host),
self.stdlog.logger.output)
def test_get_updated_time(self): def test_get_updated_time(self):
retval = "2016-11-02T22:40:31.000000" retval = "2016-11-02T22:40:31.000000"
service_ref = { service_ref = {