From 40830c883408387b970454fb9030f5f38b0351ab Mon Sep 17 00:00:00 2001
From: wangyu <wangyu_yewu@cmss.chinamobile.com>
Date: Sun, 27 Sep 2020 17:56:23 +0800
Subject: [PATCH] Fix health policy attach/detach action when
 messaging.MessagingTimeout occurred
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Attaching a health policy to a cluster, or detaching it from one, is still
reported as successful when the notification to the health manager service
fails with messaging.MessagingTimeout.
This is harmful for cluster health management; the following may happen:
1. If messaging.MessagingTimeout occurs while attaching the policy,
   registering the cluster with the health registry fails, so the
   cluster is never health-checked even though the policy is reported
   as attached.
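2. If messaging.MessagingTimeout occurs while detaching the policy,
   unregistering the cluster from the health registry fails, so the
   cluster keeps being health-checked even though the policy is
   reported as detached.

Make attach and detach return a failure result with an error message when
the register/unregister notification to the health manager times out.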

Change-Id: Iedf1f0b77e9034ccb81a9073b936875c2259fb10
Closes-Bug: #1897443
---
 senlin/policies/health_policy.py              |  9 +++++++-
 .../tests/unit/policies/test_health_policy.py | 21 +++++++++++++++++--
 2 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/senlin/policies/health_policy.py b/senlin/policies/health_policy.py
index cfa234144..ed1ebfdd2 100644
--- a/senlin/policies/health_policy.py
+++ b/senlin/policies/health_policy.py
@@ -381,7 +381,12 @@ class HealthPolicy(base.Policy):
         detection_mode = {'detection_modes': converted_detection_modes}
         kwargs['params'].update(detection_mode)
 
-        health_manager.register(cluster.id, engine_id=None, **kwargs)
+        ret = health_manager.register(cluster.id, engine_id=None, **kwargs)
+        if not ret:
+            LOG.warning('Registering health manager for cluster %s '
+                        'timed out.', cluster.id)
+            err_msg = _("Registering health manager for cluster timed out.")
+            return False, err_msg
 
         data = {
             'interval': self.interval,
@@ -405,6 +410,8 @@ class HealthPolicy(base.Policy):
         if not ret:
             LOG.warning('Unregistering health manager for cluster %s '
                         'timed out.', cluster.id)
+            err_msg = _("Unregistering health manager for cluster timed out.")
+            return False, err_msg
         return True, ''
 
     def pre_op(self, cluster_id, action, **args):
diff --git a/senlin/tests/unit/policies/test_health_policy.py b/senlin/tests/unit/policies/test_health_policy.py
index e4976aa2f..7ef532cb6 100644
--- a/senlin/tests/unit/policies/test_health_policy.py
+++ b/senlin/tests/unit/policies/test_health_policy.py
@@ -278,12 +278,29 @@ class TestHealthPolicy(base.SenlinTestCase):
         self.assertEqual("Recovery action REBOOT is only applicable to "
                          "os.nova.server clusters.", data)
 
+    @mock.patch.object(health_manager, 'register')
+    def test_attach_failed_with_notify_timeout(self, mock_hm_reg):
+        mock_hm_reg.return_value = False
+        res, data = self.hp.attach(self.cluster)
+        self.assertFalse(res)
+        self.assertEqual("Registering health manager for cluster timed "
+                         "out.", data)
+
     @mock.patch.object(health_manager, 'unregister')
-    def test_detach(self, mock_hm_reg):
+    def test_detach(self, mock_hm_unreg):
         res, data = self.hp.detach(self.cluster)
         self.assertTrue(res)
         self.assertEqual('', data)
-        mock_hm_reg.assert_called_once_with('CLUSTER_ID')
+        mock_hm_unreg.assert_called_once_with('CLUSTER_ID')
+
+    @mock.patch.object(health_manager, 'unregister')
+    def test_detach_failed_with_notify_timeout(self, mock_hm_unreg):
+        mock_hm_unreg.return_value = False
+        res, data = self.hp.detach(self.cluster)
+        self.assertFalse(res)
+        self.assertEqual("Unregistering health manager for cluster timed "
+                         "out.", data)
+        mock_hm_unreg.assert_called_once_with('CLUSTER_ID')
 
     def test_pre_op_default(self):
         action = mock.Mock(context='action_context', data={},