Coordinator handles ToozError when joining group

While joining a partitioning group, we check for MemberAlreadyExist and GroupNotCreated, but fail to account for connection failure scenarios. This leads to an ugly stack trace in the notification agent logs, and the agent bails out before configuring the notification and pipeline listeners. This fix handles the ToozError exception and logs an error message before retrying the join, on the assumption that the error was a temporary problem. Change-Id: I2aed2241ded798464089b3eec5e1394422a45844 Closes-Bug: 1496982
2015-09-17 23:03:42 +00:00 · 2015-09-17 23:03:42 +00:00 · 09d9325ddf
parent bf3e38085d
commit 09d9325ddf
2 changed files with 35 additions and 1 deletions
--- a/ceilometer/coordination.py
+++ b/ceilometer/coordination.py
@ -130,6 +130,9 @@ class PartitionCoordinator(object):
                    create_grp_req.get()
                except tooz.coordination.GroupAlreadyExist:
                    pass
+            except tooz.coordination.ToozError:
+                LOG.exception(_LE('Error joining partitioning group %s,'
+                                  ' re-trying'), group_id)
        self._groups.add(group_id)

    def leave_group(self, group_id):
--- a/ceilometer/tests/unit/test_coordination.py
+++ b/ceilometer/tests/unit/test_coordination.py
@ -83,6 +83,21 @@ class MockToozCoordExceptionRaiser(MockToozCoordinator):
        raise tooz.coordination.ToozError('error')


+class MockToozCoordExceptionOnJoinRaiser(MockToozCoordinator):
+    def __init__(self, member_id, shared_storage, retry_count=None):
+        super(MockToozCoordExceptionOnJoinRaiser,
+              self).__init__(member_id, shared_storage)
+        self.tooz_error_count = retry_count
+        self.count = 0
+
+    def join_group(self, group_id, capabilities=b''):
+        if self.count == self.tooz_error_count:
+            return MockAsyncResult(None)
+        else:
+            self.count += 1
+            raise tooz.coordination.ToozError('error')
+
+
 class MockAsyncResult(tooz.coordination.CoordAsyncResult):
    def __init__(self, result):
        self.result = result
@ -135,12 +150,14 @@ class TestPartitioning(base.BaseTestCase):
        self.shared_storage = {}

    def _get_new_started_coordinator(self, shared_storage, agent_id=None,
-                                     coordinator_cls=None):
+                                     coordinator_cls=None, retry_count=None):
        coordinator_cls = coordinator_cls or MockToozCoordinator
        self.CONF.set_override('backend_url', 'xxx://yyy',
                               group='coordination')
        with mock.patch('tooz.coordination.get_coordinator',
                        lambda _, member_id:
+                        coordinator_cls(member_id, shared_storage,
+                                        retry_count) if retry_count else
                        coordinator_cls(member_id, shared_storage)):
            pc = coordination.PartitionCoordinator(agent_id)
            pc.start()
@ -210,6 +227,20 @@ class TestPartitioning(base.BaseTestCase):
        for e in expected_errors:
            self.assertIn(e, self.str_handler.messages['error'])

+    def test_coordination_backend_connection_fail_on_join(self):
+        coord = self._get_new_started_coordinator(
+            {'group'}, 'agent1', MockToozCoordExceptionOnJoinRaiser,
+            retry_count=2)
+        with mock.patch('tooz.coordination.get_coordinator',
+                        return_value=MockToozCoordExceptionOnJoinRaiser):
+            coord.join_group(group_id='group')
+
+        expected_errors = ['Error joining partitioning group group,'
+                           ' re-trying',
+                           'Error joining partitioning group group,'
+                           ' re-trying']
+        self.assertEqual(expected_errors, self.str_handler.messages['error'])
+
    def test_reconnect(self):
        coord = self._get_new_started_coordinator({}, 'a',
                                                  MockToozCoordExceptionRaiser)