Merge "servicegroup: stop zombie service due to exception"
This commit is contained in:
commit
0b69e8b863
@ -14,20 +14,18 @@
|
||||
# limitations under the License.
|
||||
|
||||
from oslo_config import cfg
|
||||
from oslo_db import exception as db_exception
|
||||
from oslo_log import log as logging
|
||||
import oslo_messaging as messaging
|
||||
from oslo_utils import timeutils
|
||||
import six
|
||||
|
||||
from nova.i18n import _, _LI, _LW
|
||||
from nova.i18n import _, _LI, _LW, _LE
|
||||
from nova.servicegroup import api
|
||||
from nova.servicegroup.drivers import base
|
||||
|
||||
|
||||
CONF = cfg.CONF
|
||||
CONF.import_opt('service_down_time', 'nova.service')
|
||||
CONF.import_opt('use_local', 'nova.conductor.api', group='conductor')
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
@ -85,13 +83,6 @@ class DbDriver(base.Driver):
|
||||
def _report_state(self, service):
|
||||
"""Update the state of this service in the datastore."""
|
||||
|
||||
if CONF.conductor.use_local:
|
||||
# need to catch DB type errors
|
||||
exc_cls = db_exception.DBError # oslo.db exception base class
|
||||
else:
|
||||
# need to catch messaging timeouts
|
||||
exc_cls = messaging.MessagingTimeout
|
||||
|
||||
try:
|
||||
service.service_ref.report_count += 1
|
||||
service.service_ref.save()
|
||||
@ -100,12 +91,20 @@ class DbDriver(base.Driver):
|
||||
if getattr(service, 'model_disconnected', False):
|
||||
service.model_disconnected = False
|
||||
LOG.info(
|
||||
_LI('Recovered connection to nova-conductor '
|
||||
'for reporting service status.'))
|
||||
|
||||
# the type of failure depends on use of remote or local conductor
|
||||
except exc_cls:
|
||||
_LI('Recovered from being unable to report status.'))
|
||||
except messaging.MessagingTimeout:
|
||||
# NOTE(johngarbutt) during upgrade we will see messaging timeouts
|
||||
# as nova-conductor is restarted, so only log this error once.
|
||||
if not getattr(service, 'model_disconnected', False):
|
||||
service.model_disconnected = True
|
||||
LOG.warn(_LW('Lost connection to nova-conductor '
|
||||
'for reporting service status.'))
|
||||
except Exception:
|
||||
# NOTE(rpodolyaka): we'd like to avoid catching of all possible
|
||||
# exceptions here, but otherwise it would become possible for
|
||||
# the state reporting thread to stop abruptly, and thus leave
|
||||
# the service unusable until it's restarted.
|
||||
LOG.exception(
|
||||
_LE('Unexpected error while reporting service status'))
|
||||
# trigger the recovery log message, if this error goes away
|
||||
service.model_disconnected = True
|
||||
|
@ -86,6 +86,7 @@ class DBServiceGroupTestCase(test.NoDBTestCase):
|
||||
fn(service)
|
||||
upd_mock.assert_called_once_with()
|
||||
self.assertEqual(11, service_ref.report_count)
|
||||
self.assertFalse(service.model_disconnected)
|
||||
|
||||
@mock.patch.object(objects.Service, 'save')
|
||||
def _test_report_state_error(self, exc_cls, upd_mock):
|
||||
@ -96,12 +97,23 @@ class DBServiceGroupTestCase(test.NoDBTestCase):
|
||||
service_ref=service_ref)
|
||||
fn = self.servicegroup_api._driver._report_state
|
||||
fn(service) # fail if exception not caught
|
||||
self.assertTrue(service.model_disconnected)
|
||||
|
||||
def test_report_state_remote_error_handling(self):
|
||||
# test error handling using remote conductor
|
||||
self.flags(use_local=False, group='conductor')
|
||||
self._test_report_state_error(messaging.RemoteError)
|
||||
|
||||
def test_report_state_remote_error_handling_timeout(self):
|
||||
# test error handling using remote conductor
|
||||
self.flags(use_local=False, group='conductor')
|
||||
self._test_report_state_error(messaging.MessagingTimeout)
|
||||
|
||||
def test_report_state_remote_unexpected_error(self):
|
||||
# unexpected errors must be handled, but disconnected flag not touched
|
||||
self.flags(use_local=False, group='conductor')
|
||||
self._test_report_state_error(RuntimeError)
|
||||
|
||||
def test_report_state_local_error_handling(self):
|
||||
# if using local conductor, the db driver must handle DB errors
|
||||
self.flags(use_local=True, group='conductor')
|
||||
@ -109,3 +121,8 @@ class DBServiceGroupTestCase(test.NoDBTestCase):
|
||||
# mock an oslo.db DBError as it's an exception base class for
|
||||
# oslo.db DB errors (eg DBConnectionError)
|
||||
self._test_report_state_error(db_exception.DBError)
|
||||
|
||||
def test_report_state_local_unexpected_error(self):
|
||||
# unexpected errors must be handled, but disconnected flag not touched
|
||||
self.flags(use_local=True, group='conductor')
|
||||
self._test_report_state_error(RuntimeError)
|
||||
|
Loading…
Reference in New Issue
Block a user