Merge "servicegroup: stop zombie service due to exception"

This commit is contained in:
Jenkins 2015-12-01 00:14:10 +00:00 committed by Gerrit Code Review
commit 0b69e8b863
2 changed files with 31 additions and 15 deletions

View File

@ -14,20 +14,18 @@
# limitations under the License. # limitations under the License.
from oslo_config import cfg from oslo_config import cfg
from oslo_db import exception as db_exception
from oslo_log import log as logging from oslo_log import log as logging
import oslo_messaging as messaging import oslo_messaging as messaging
from oslo_utils import timeutils from oslo_utils import timeutils
import six import six
from nova.i18n import _, _LI, _LW from nova.i18n import _, _LI, _LW, _LE
from nova.servicegroup import api from nova.servicegroup import api
from nova.servicegroup.drivers import base from nova.servicegroup.drivers import base
CONF = cfg.CONF CONF = cfg.CONF
CONF.import_opt('service_down_time', 'nova.service') CONF.import_opt('service_down_time', 'nova.service')
CONF.import_opt('use_local', 'nova.conductor.api', group='conductor')
LOG = logging.getLogger(__name__) LOG = logging.getLogger(__name__)
@ -85,13 +83,6 @@ class DbDriver(base.Driver):
def _report_state(self, service): def _report_state(self, service):
"""Update the state of this service in the datastore.""" """Update the state of this service in the datastore."""
if CONF.conductor.use_local:
# need to catch DB type errors
exc_cls = db_exception.DBError # oslo.db exception base class
else:
# need to catch messaging timeouts
exc_cls = messaging.MessagingTimeout
try: try:
service.service_ref.report_count += 1 service.service_ref.report_count += 1
service.service_ref.save() service.service_ref.save()
@ -100,12 +91,20 @@ class DbDriver(base.Driver):
if getattr(service, 'model_disconnected', False): if getattr(service, 'model_disconnected', False):
service.model_disconnected = False service.model_disconnected = False
LOG.info( LOG.info(
_LI('Recovered connection to nova-conductor ' _LI('Recovered from being unable to report status.'))
'for reporting service status.')) except messaging.MessagingTimeout:
# NOTE(johngarbutt) during upgrade we will see messaging timeouts
# the type of failure depends on use of remote or local conductor # as nova-conductor is restarted, so only log this error once.
except exc_cls:
if not getattr(service, 'model_disconnected', False): if not getattr(service, 'model_disconnected', False):
service.model_disconnected = True service.model_disconnected = True
LOG.warn(_LW('Lost connection to nova-conductor ' LOG.warn(_LW('Lost connection to nova-conductor '
'for reporting service status.')) 'for reporting service status.'))
except Exception:
# NOTE(rpodolyaka): we'd like to avoid catching of all possible
# exceptions here, but otherwise it would become possible for
# the state reporting thread to stop abruptly, and thus leave
# the service unusable until it's restarted.
LOG.exception(
_LE('Unexpected error while reporting service status'))
# trigger the recovery log message, if this error goes away
service.model_disconnected = True

View File

@ -86,6 +86,7 @@ class DBServiceGroupTestCase(test.NoDBTestCase):
fn(service) fn(service)
upd_mock.assert_called_once_with() upd_mock.assert_called_once_with()
self.assertEqual(11, service_ref.report_count) self.assertEqual(11, service_ref.report_count)
self.assertFalse(service.model_disconnected)
@mock.patch.object(objects.Service, 'save') @mock.patch.object(objects.Service, 'save')
def _test_report_state_error(self, exc_cls, upd_mock): def _test_report_state_error(self, exc_cls, upd_mock):
@ -96,12 +97,23 @@ class DBServiceGroupTestCase(test.NoDBTestCase):
service_ref=service_ref) service_ref=service_ref)
fn = self.servicegroup_api._driver._report_state fn = self.servicegroup_api._driver._report_state
fn(service) # fail if exception not caught fn(service) # fail if exception not caught
self.assertTrue(service.model_disconnected)
def test_report_state_remote_error_handling(self): def test_report_state_remote_error_handling(self):
# test error handling using remote conductor
self.flags(use_local=False, group='conductor')
self._test_report_state_error(messaging.RemoteError)
def test_report_state_remote_error_handling_timeout(self):
# test error handling using remote conductor # test error handling using remote conductor
self.flags(use_local=False, group='conductor') self.flags(use_local=False, group='conductor')
self._test_report_state_error(messaging.MessagingTimeout) self._test_report_state_error(messaging.MessagingTimeout)
def test_report_state_remote_unexpected_error(self):
# unexpected errors must be handled and set the disconnected flag
self.flags(use_local=False, group='conductor')
self._test_report_state_error(RuntimeError)
def test_report_state_local_error_handling(self): def test_report_state_local_error_handling(self):
# if using local conductor, the db driver must handle DB errors # if using local conductor, the db driver must handle DB errors
self.flags(use_local=True, group='conductor') self.flags(use_local=True, group='conductor')
@ -109,3 +121,8 @@ class DBServiceGroupTestCase(test.NoDBTestCase):
# mock an oslo.db DBError as it's an exception base class for # mock an oslo.db DBError as it's an exception base class for
# oslo.db DB errors (eg DBConnectionError) # oslo.db DB errors (eg DBConnectionError)
self._test_report_state_error(db_exception.DBError) self._test_report_state_error(db_exception.DBError)
def test_report_state_local_unexpected_error(self):
# unexpected errors must be handled and set the disconnected flag
self.flags(use_local=True, group='conductor')
self._test_report_state_error(RuntimeError)