From 49b0d1741c674714fabf24d8409810064b953202 Mon Sep 17 00:00:00 2001
From: Roman Podoliaka
Date: Thu, 19 Nov 2015 16:00:01 +0200
Subject: [PATCH] servicegroup: stop zombie service due to exception

If an exception is raised out of the _report_state call, the service
stops reporting any updates to the database, so it is considered dead,
creating a kind of zombie service.

I55417a5b91282c69432bb2ab64441c5cea474d31 seems to have introduced a
regression that leads to nova-* services being marked as 'down' if an
error happens in a remote nova-conductor while it processes a state
report: only timeout errors are currently handled, but other errors
are possible too, e.g. a DBError (wrapped in a RemoteError on the RPC
client side) if the DB temporarily goes away. Such an unhandled
exception effectively breaks the state reporting thread - the service
will only be considered 'up' again after a restart.

While the intention of I55417a5b91282c69432bb2ab64441c5cea474d31 was
to avoid catching all possible exceptions, it looks like we must do
exactly that to avoid creating a zombie.

The other part of that change was to ensure that, during an upgrade,
we do not spam the logs with MessagingTimeouts while the
nova-conductors are being restarted. This change ensures that still
happens.

Closes-Bug: #1517926
Change-Id: I44f118f82fbb811b790222face4c74d79795fe21
---
 nova/servicegroup/drivers/db.py                    | 29 +++++++++----------
 .../unit/servicegroup/test_db_servicegroup.py      | 17 +++++++++++
 2 files changed, 31 insertions(+), 15 deletions(-)

diff --git a/nova/servicegroup/drivers/db.py b/nova/servicegroup/drivers/db.py
index 28471d86f137..99bad8522ed0 100644
--- a/nova/servicegroup/drivers/db.py
+++ b/nova/servicegroup/drivers/db.py
@@ -14,20 +14,18 @@
 # limitations under the License.
 
 from oslo_config import cfg
-from oslo_db import exception as db_exception
 from oslo_log import log as logging
 import oslo_messaging as messaging
 from oslo_utils import timeutils
 import six
 
-from nova.i18n import _, _LI, _LW
+from nova.i18n import _, _LI, _LW, _LE
 from nova.servicegroup import api
 from nova.servicegroup.drivers import base
 
 
 CONF = cfg.CONF
 CONF.import_opt('service_down_time', 'nova.service')
-CONF.import_opt('use_local', 'nova.conductor.api', group='conductor')
 
 LOG = logging.getLogger(__name__)
 
@@ -85,13 +83,6 @@ class DbDriver(base.Driver):
 
     def _report_state(self, service):
         """Update the state of this service in the datastore."""
-        if CONF.conductor.use_local:
-            # need to catch DB type errors
-            exc_cls = db_exception.DBError  # oslo.db exception base class
-        else:
-            # need to catch messaging timeouts
-            exc_cls = messaging.MessagingTimeout
-
         try:
             service.service_ref.report_count += 1
             service.service_ref.save()
@@ -100,12 +91,20 @@ class DbDriver(base.Driver):
             if getattr(service, 'model_disconnected', False):
                 service.model_disconnected = False
                 LOG.info(
-                    _LI('Recovered connection to nova-conductor '
-                        'for reporting service status.'))
-
-        # the type of failure depends on use of remote or local conductor
-        except exc_cls:
+                    _LI('Recovered from being unable to report status.'))
+        except messaging.MessagingTimeout:
+            # NOTE(johngarbutt) during upgrade we will see messaging timeouts
+            # as nova-conductor is restarted, so only log this error once.
             if not getattr(service, 'model_disconnected', False):
                 service.model_disconnected = True
                 LOG.warn(_LW('Lost connection to nova-conductor '
                              'for reporting service status.'))
+        except Exception:
+            # NOTE(rpodolyaka): we'd like to avoid catching all possible
+            # exceptions here, but otherwise it would become possible for
+            # the state reporting thread to stop abruptly, and thus leave
+            # the service unusable until it's restarted.
+            LOG.exception(
+                _LE('Unexpected error while reporting service status'))
+            # trigger the recovery log message, if this error goes away
+            service.model_disconnected = True
diff --git a/nova/tests/unit/servicegroup/test_db_servicegroup.py b/nova/tests/unit/servicegroup/test_db_servicegroup.py
index b53684d4af88..05f5b25f4dee 100644
--- a/nova/tests/unit/servicegroup/test_db_servicegroup.py
+++ b/nova/tests/unit/servicegroup/test_db_servicegroup.py
@@ -86,6 +86,7 @@ class DBServiceGroupTestCase(test.NoDBTestCase):
         fn(service)
         upd_mock.assert_called_once_with()
         self.assertEqual(11, service_ref.report_count)
+        self.assertFalse(service.model_disconnected)
 
     @mock.patch.object(objects.Service, 'save')
     def _test_report_state_error(self, exc_cls, upd_mock):
@@ -96,12 +97,23 @@ class DBServiceGroupTestCase(test.NoDBTestCase):
                                              service_ref=service_ref)
         fn = self.servicegroup_api._driver._report_state
         fn(service)  # fail if exception not caught
+        self.assertTrue(service.model_disconnected)
 
     def test_report_state_remote_error_handling(self):
+        # test error handling using remote conductor
+        self.flags(use_local=False, group='conductor')
+        self._test_report_state_error(messaging.RemoteError)
+
+    def test_report_state_remote_error_handling_timeout(self):
         # test error handling using remote conductor
         self.flags(use_local=False, group='conductor')
         self._test_report_state_error(messaging.MessagingTimeout)
 
+    def test_report_state_remote_unexpected_error(self):
+        # unexpected errors must be handled and mark the service disconnected
+        self.flags(use_local=False, group='conductor')
+        self._test_report_state_error(RuntimeError)
+
     def test_report_state_local_error_handling(self):
         # if using local conductor, the db driver must handle DB errors
         self.flags(use_local=True, group='conductor')
@@ -109,3 +121,8 @@ class DBServiceGroupTestCase(test.NoDBTestCase):
         # mock an oslo.db DBError as it's an exception base class for
         # oslo.db DB errors (eg DBConnectionError)
         self._test_report_state_error(db_exception.DBError)
+
+    def test_report_state_local_unexpected_error(self):
+        # unexpected errors must be handled and mark the service disconnected
+        self.flags(use_local=True, group='conductor')
+        self._test_report_state_error(RuntimeError)
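
As a standalone illustration of the handling this patch settles on: the
reporting callback catches the expected MessagingTimeout quietly, logs
anything else with a traceback, and in both cases returns so the
periodic reporting thread keeps running. This is a minimal sketch, not
code from the Nova tree - DummyServiceRef, DummyService, report_state
and the module-level logger are invented for the example.

    # Sketch only: mirrors the structure of the new _report_state above,
    # with made-up stand-ins for the Nova service objects.
    import logging

    import oslo_messaging as messaging

    LOG = logging.getLogger(__name__)


    class DummyServiceRef(object):
        report_count = 0

        def save(self):
            # Stand-in for objects.Service.save(); raise here to simulate
            # a DB/RPC failure and exercise the handlers below.
            raise RuntimeError('DB temporarily went away')


    class DummyService(object):
        service_ref = DummyServiceRef()
        model_disconnected = False


    def report_state(service):
        """Periodic state report that must never kill the reporting thread."""
        try:
            service.service_ref.report_count += 1
            service.service_ref.save()

            # Coming back from an outage: emit a single recovery message.
            if getattr(service, 'model_disconnected', False):
                service.model_disconnected = False
                LOG.info('Recovered from being unable to report status.')
        except messaging.MessagingTimeout:
            # Expected while nova-conductor restarts during an upgrade,
            # so log it only once per outage.
            if not getattr(service, 'model_disconnected', False):
                service.model_disconnected = True
                LOG.warning('Lost connection for reporting service status.')
        except Exception:
            # Anything else would otherwise terminate the reporting thread
            # and leave a zombie service until the process is restarted.
            LOG.exception('Unexpected error while reporting service status')
            # Ensure the recovery message fires once the error clears.
            service.model_disconnected = True


    if __name__ == '__main__':
        logging.basicConfig(level=logging.INFO)
        service = DummyService()
        report_state(service)          # logs the traceback, does not raise
        assert service.model_disconnected

Setting model_disconnected in the broad handler is what makes the
recovery message fire once the error goes away, which is also why the
updated tests assert the flag is set after a failed report.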