Rework health check code
* Modified health check for node poll status mode to treat a node as
  healthy if it encounters an error getting server status.
* Simplified NodePollStatusHealthCheck code
* Added do_healthcheck method separate from do_check to clearly show
  health check behaviour
* Simplified NodePollUrlHealthCheck code by using tenacity
* Added more log statements

Change-Id: I76f0ef95067c81f123bf548c723e93d4cf9c2d49
Closes-Bug: 1800038
parent 111ea8eabd
commit 52d8702274
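Note for reviewers: the NodePollUrlHealthCheck rework below replaces a
hand-rolled retry loop with tenacity. A minimal sketch of the retry pattern
adopted here (the probe function and the numeric values are illustrative,
not taken from this patch):

    import tenacity

    def _return_last_value(retry_state):
        # Hand back the final attempt's result instead of raising
        # RetryError once the stop condition is reached.
        return retry_state.outcome.result()

    @tenacity.retry(
        retry=tenacity.retry_if_result(lambda healthy: healthy is False),
        wait=tenacity.wait_fixed(2),              # seconds between attempts
        stop=tenacity.stop_after_attempt(3),      # give up after three probes
        retry_error_callback=_return_last_value,  # return last result, not an error
    )
    def probe():
        # Stand-in for _poll_url(): returns True when the node looks healthy.
        return False

    print(probe())  # prints False after three attempts, without raising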
senlin/common/consts.py
@@ -317,6 +317,16 @@ LIFECYCLE_TRANSITION_TYPE = (

 VM_STATUS = (
     VS_ACTIVE, VS_ERROR, VS_SUSPENDED, VS_SHUTOFF, VS_PAUSED, VS_RESCUE,
+    VS_DELETED,
 ) = (
-    'ACTIVE', 'ERROR', 'SUSPENDED', 'SHUTOFF', 'PAUSED', 'RESCUE',
+    'ACTIVE', 'ERROR', 'SUSPENDED', 'SHUTOFF', 'PAUSED', 'RESCUE', 'DELETED',
 )
+
+HEALTH_CHECK_MESSAGE = (
+    POLL_STATUS_PASS, POLL_STATUS_FAIL, POLL_URL_PASS, POLL_URL_FAIL,
+) = (
+    'Poll Status health check passed',
+    'Poll Status health check failed',
+    'Poll URL health check passed',
+    'Poll URL health check failed',
+)

senlin/engine/health_manager.py
@@ -26,6 +26,7 @@ from oslo_service import service
 from oslo_service import threadgroup
 from oslo_utils import timeutils
 import re
+import tenacity
 import time

 from senlin.common import consts
@@ -241,6 +242,32 @@ class HealthCheckType(object):
         """
         pass

+    def _node_within_grace_period(self, node):
+        """Check if current time is within the node_update_timeout grace period
+
+        :returns: True if current time is less than node_update_timeout since
+                  last node update action. False otherwise.
+        """
+
+        node_last_updated = node.updated_at or node.init_at
+        if timeutils.is_older_than(node_last_updated,
+                                   self.node_update_timeout):
+            # node was last updated more than node_update_timeout seconds ago
+            # -> we are outside the grace period
+            LOG.info("%s was updated at %s which is more "
+                     "than %d secs ago. Mark node as unhealthy.",
+                     node.name, node_last_updated,
+                     self.node_update_timeout)
+            return False
+        else:
+            # node was last updated less than node_update_timeout seconds ago
+            # -> we are inside the grace period
+            LOG.info("%s was updated at %s which is less "
+                     "than %d secs ago. Mark node as healthy.",
+                     node.name, node_last_updated,
+                     self.node_update_timeout)
+            return True
+

 class NodePollStatusHealthCheck(HealthCheckType):
     def run_health_check(self, ctx, node):
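The grace-period logic above hinges on oslo.utils' timeutils.is_older_than;
a small sketch of the semantics assumed (the timeout value is illustrative):

    from oslo_utils import timeutils

    node_update_timeout = 300  # seconds; illustrative value
    last_updated = timeutils.utcnow()

    # is_older_than(ts, seconds) is True once more than `seconds` have
    # elapsed since `ts`. Inside the grace period it is False, so the node
    # is still reported as healthy while it warms up.
    within_grace = not timeutils.is_older_than(last_updated,
                                               node_update_timeout)
    print(within_grace)  # True immediately after an update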
@@ -248,34 +275,26 @@ class NodePollStatusHealthCheck(HealthCheckType):

         :returns: True if node is healthy. False otherwise.
         """

         try:
             # create engine node from db node
             entity = node_mod.Node._from_object(ctx, node)

-            if not entity.do_check(ctx, return_check_result=True):
-                # server was not found as a result of performing check
-                node_last_updated = node.updated_at or node.init_at
-                if not timeutils.is_older_than(
-                        node_last_updated, self.node_update_timeout):
-                    LOG.info("Node %s was updated at %s which is less "
-                             "than %d secs ago. Skip node recovery from "
-                             "NodePollStatusHealthCheck.",
-                             node.id, node_last_updated,
-                             self.node_update_timeout)
-                    return True
-                else:
-                    return False
-            else:
-                LOG.debug("NodePollStatusHealthCheck reports node %s is "
-                          "healthy.", node.id)
-                return True
+            # If health check returns True, return True to mark node as
+            # healthy. Else return True to mark node as healthy if we are still
+            # within the node's grace period to allow the node to warm-up.
+            # Return False to mark the node as unhealthy if we are outside the
+            # grace period.
+            return (entity.do_healthcheck(ctx) or
+                    self._node_within_grace_period(node))
         except Exception as ex:
             LOG.warning(
                 'Error when performing health check on node %s: %s',
                 node.id, ex
             )
-            return False
+
+            # treat node as healthy when an exception is encountered
+            return True


 class NodePollUrlHealthCheck(HealthCheckType):
@@ -299,74 +318,88 @@ class NodePollUrlHealthCheck(HealthCheckType):

         return url

-    def run_health_check(self, ctx, node):
-        """Routine to check a node status from a url and recovery if necessary
-
-        :param node: The node to be checked.
-        :returns: True if node is considered to be healthy. False otherwise.
-        """
-
-        url_template = self.params['poll_url']
+    def _poll_url(self, url, node):
         verify_ssl = self.params['poll_url_ssl_verify']
         conn_error_as_unhealthy = self.params[
             'poll_url_conn_error_as_unhealthy']
         expected_resp_str = self.params['poll_url_healthy_response']
+        retry_interval = self.params['poll_url_retry_interval']
+
+        timeout = max(retry_interval * 0.1, 1)
+
+        try:
+            result = utils.url_fetch(url, timeout=timeout,
+                                     verify=verify_ssl)
+        except Exception as ex:
+            if conn_error_as_unhealthy:
+                LOG.info('%s for %s: connection error when polling URL (%s)',
+                         consts.POLL_URL_FAIL, node.name, ex)
+                return False
+            else:
+                LOG.info('%s for %s: ignoring connection error when polling '
+                         'URL (%s)',
+                         consts.POLL_URL_PASS, node.name, ex)
+                return True
+
+        if not re.search(expected_resp_str, result):
+            LOG.info('%s for %s: did not find expected response string %s in '
+                     'URL result (%s)',
+                     consts.POLL_URL_FAIL, node.name, expected_resp_str,
+                     result)
+            return False
+
+        LOG.info('%s for %s: matched expected response string.',
+                 consts.POLL_URL_PASS, node.name)
+        return True
+
+    def run_health_check(self, ctx, node):
+        """Routine to check a node status from a url and recovery if necessary
+
+        :param node: The node to be checked.
+        :returns: True if node is healthy. False otherwise.
+        """
+
         max_unhealthy_retry = self.params['poll_url_retry_limit']
         retry_interval = self.params['poll_url_retry_interval']

-        def stop_node_recovery():
-            node_last_updated = node.updated_at or node.init_at
-            if not timeutils.is_older_than(
-                    node_last_updated, self.node_update_timeout):
-                LOG.info("Node %s was updated at %s which is less than "
-                         "%d secs ago. Skip node recovery from "
-                         "NodePollUrlHealthCheck.",
-                         node.id, node_last_updated, self.node_update_timeout)
-                return True
-
-            LOG.info("Node %s is reported as down (%d retries left)",
-                     node.id, available_attemps)
-            time.sleep(retry_interval)
-
-            return False
-
-        if node.status != consts.NS_ACTIVE:
-            LOG.info("Skip node recovery because node %s is not in "
-                     "ACTIVE state.", node.id)
-            return True
-
-        url = self._expand_url_template(url_template, node)
-        LOG.debug("Polling node status from URL: %s", url)
-
-        available_attemps = max_unhealthy_retry
-        timeout = max(retry_interval * 0.1, 1)
-        while available_attemps > 0:
-            available_attemps -= 1
-
-            try:
-                result = utils.url_fetch(
-                    url, timeout=timeout, verify=verify_ssl)
-            except utils.URLFetchError as ex:
-                if conn_error_as_unhealthy:
-                    if stop_node_recovery():
-                        return True
-                    continue
-                else:
-                    LOG.error("Error when requesting node health status from"
-                              " %s: %s", url, ex)
-                    return True
-
-            LOG.debug("Node status returned from URL(%s): %s", url,
-                      result)
-            if re.search(expected_resp_str, result):
-                LOG.debug('NodePollUrlHealthCheck reports node %s is healthy.',
-                          node.id)
-                return True
-
-            if stop_node_recovery():
-                return True
-
-        return False
+        def _return_last_value(retry_state):
+            return retry_state.outcome.result()
+
+        @tenacity.retry(
+            retry=tenacity.retry_if_result(lambda x: x is False),
+            wait=tenacity.wait_fixed(retry_interval),
+            retry_error_callback=_return_last_value,
+            stop=tenacity.stop_after_attempt(max_unhealthy_retry)
+        )
+        def _poll_url_with_retry(url):
+            return self._poll_url(url, node)
+
+        try:
+            if node.status != consts.NS_ACTIVE:
+                LOG.info('%s for %s: node is not in ACTIVE state, so skip '
+                         'poll url',
+                         consts.POLL_URL_PASS, node.name)
+                return True
+
+            url_template = self.params['poll_url']
+            url = self._expand_url_template(url_template, node)
+
+            # If health check returns True, return True to mark node as
+            # healthy. Else return True to mark node as healthy if we are still
+            # within the node's grace period to allow the node to warm-up.
+            # Return False to mark the node as unhealthy if we are outside the
+            # grace period.
+            return (_poll_url_with_retry(url) or
+                    self._node_within_grace_period(node))
+        except Exception as ex:
+            LOG.warning(
+                '%s for %s: Ignoring error on poll URL: %s',
+                consts.POLL_URL_PASS, node.name, ex
+            )
+
+            # treat node as healthy when an exception is encountered
+            return True


 class HealthManager(service.Service):
@@ -428,8 +461,6 @@ class HealthManager(service.Service):
         :returns: Recover action
         """
         try:
-            LOG.info("%s is requesting node recovery "
-                     "for %s.", self.__class__.__name__, node_id)
             req = objects.NodeRecoverRequest(identity=node_id,
                                              params=recover_action)

@@ -516,6 +547,9 @@ class HealthManager(service.Service):
                                              recovery_cond))

             if not node_is_healthy:
+                LOG.info("Health check failed for %s in %s and "
+                         "recovery has started.",
+                         node.name, cluster.name)
                 action = self._recover_node(node.id, ctx,
                                             recover_action)
                 actions.append(action)
@@ -529,7 +563,7 @@ class HealthManager(service.Service):
                                 "within specified timeout: %s", a['action'],
                                 reason)

-            if len(actions) > 0:
+            if len(actions) == 0:
                 LOG.info('Health check passed for all nodes in cluster %s.',
                          cluster_id)
         except Exception as ex:

senlin/engine/node.py
@@ -316,7 +316,7 @@ class Node(object):
             self.index = -1
         return True

-    def do_check(self, context, return_check_result=False):
+    def do_check(self, context):
         if not self.physical_id:
             return False

@@ -330,9 +330,6 @@ class Node(object):
             self.set_status(context, consts.NS_ERROR, six.text_type(ex))
             return False

-        if return_check_result:
-            return res
-
         # Physical object is ACTIVE but for some reason the node status in
         # senlin was WARNING. We only update the status_reason
         if res:
@@ -350,6 +347,17 @@ class Node(object):

         return True

+    def do_healthcheck(self, context):
+        """health check a node.
+
+        This function is supposed to be invoked from the health manager to
+        check the health of a given node
+        :param context: The request context of the action.
+        :returns: True if node is healthy. False otherwise.
+        """
+
+        return pb.Profile.healthcheck_object(context, self)
+
     def do_recover(self, context, action):
         """recover a node.

@@ -358,10 +366,19 @@ class Node(object):
         :param dict options: A map containing the recovery actions (with
                              parameters if any) and fencing settings.
         """
-        if not self.physical_id:
+        options = action.inputs
+
+        operations = options.get('operation', [{'name': ''}])
+        reboot_ops = [op for op in operations
+                      if op.get('name') == consts.RECOVER_REBOOT]
+        rebuild_ops = [op for op in operations
+                       if op.get('name') == consts.RECOVER_REBUILD]
+        if not self.physical_id and (reboot_ops or rebuild_ops):
+            # physical id is required for REBOOT or REBUILD operations
+            LOG.warning('Recovery failed because node has no physical id'
+                        ' was provided for reboot or rebuild operation.')
             return False

-        options = action.inputs
         if options.get('check', False):
             res = False
             try:

senlin/profiles/base.py
@@ -301,11 +301,13 @@ class Profile(object):
     @profiler.trace('Profile.check_object', hide_args=False)
     def check_object(cls, ctx, obj):
         profile = cls.load(ctx, profile_id=obj.profile_id)
-        try:
-            return profile.do_check(obj)
-        except exc.InternalError as ex:
-            LOG.debug(ex)
-            return False
+        return profile.do_check(obj)
+
+    @classmethod
+    @profiler.trace('Profile.check_object', hide_args=False)
+    def healthcheck_object(cls, ctx, obj):
+        profile = cls.load(ctx, profile_id=obj.profile_id)
+        return profile.do_healthcheck(obj)

     @classmethod
     @profiler.trace('Profile.recover_object', hide_args=False)
@@ -461,6 +463,18 @@ class Profile(object):
         LOG.warning("Check operation not supported.")
         return True

+    def do_healthcheck(self, obj):
+        """Default healthcheck operation.
+
+        This is provided as a fallback if a specific profile type does not
+        override this method.
+
+        :param obj: The node object to operate on.
+        :return status: True indicates node is healthy, False indicates
+                        it is unhealthy.
+        """
+        return self.do_check(obj)
+
     def do_get_details(self, obj):
         """For subclass to override."""
         LOG.warning("Get_details operation not supported.")

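The do_healthcheck fallback above means a profile type only overrides the
method when its health semantics differ from do_check; a sketch of that
override pattern (the class and status names here are illustrative, not
Senlin's):

    class BaseProfile(object):
        def do_check(self, obj):
            # existing per-profile check logic
            return True

        def do_healthcheck(self, obj):
            # Fallback: health is simply the regular check result.
            return self.do_check(obj)

    class IllustrativeServerProfile(BaseProfile):
        def do_healthcheck(self, obj):
            # A server-style profile can tighten the definition of
            # "healthy", e.g. by inspecting the backing resource's status.
            unhealthy = ('ERROR', 'SHUTOFF', 'DELETED')
            return getattr(obj, 'status', None) not in unhealthy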
senlin/profiles/os/nova/server.py
@@ -1532,7 +1532,7 @@ class ServerProfile(base.Profile):
         try:
             server = self.compute(obj).server_get(obj.physical_id)
         except exc.InternalError as ex:
-            if "No Server found" in six.text_type(ex):
+            if ex.code == 404:
                 raise exc.EServerNotFound(type='server',
                                           id=obj.physical_id,
                                           message=six.text_type(ex))
@@ -1546,6 +1546,60 @@ class ServerProfile(base.Profile):

         return True

+    def do_healthcheck(self, obj):
+        """Healthcheck operation.
+
+        This method checks if a server node is healthy by getting the server
+        status from nova. A server is considered unhealthy if it does not
+        exist or its status is one of the following:
+        - ERROR
+        - SHUTOFF
+        - DELETED
+
+        :param obj: The node object to operate on.
+        :return status: True indicates node is healthy, False indicates
+                        it is unhealthy.
+        """
+        unhealthy_server_status = [consts.VS_ERROR, consts.VS_SHUTOFF,
+                                   consts.VS_DELETED]
+
+        if not obj.physical_id:
+            LOG.info('%s for %s: server has no physical ID.',
+                     consts.POLL_STATUS_FAIL, obj.name)
+            return False
+
+        try:
+            server = self.compute(obj).server_get(obj.physical_id)
+        except Exception as ex:
+            if isinstance(ex, exc.InternalError) and ex.code == 404:
+                # treat resource not found exception as unhealthy
+                LOG.info('%s for %s: server was not found.',
+                         consts.POLL_STATUS_FAIL, obj.name)
+                return False
+            else:
+                # treat all other exceptions as healthy
+                LOG.info(
+                    '%s for %s: Exception when trying to get server info but '
+                    'ignoring this error: %s.',
+                    consts.POLL_STATUS_PASS, obj.name, ex.message)
+                return True
+
+        if server is None:
+            # no server information is available, treat the node as healthy
+            LOG.info(
+                '%s for %s: No server information was returned but ignoring '
+                'this error.',
+                consts.POLL_STATUS_PASS, obj.name)
+            return True
+
+        if server.status in unhealthy_server_status:
+            LOG.info('%s for %s: server status is unhealthy.',
+                     consts.POLL_STATUS_FAIL, obj.name)
+            return False
+
+        LOG.info('%s for %s', consts.POLL_STATUS_PASS, obj.name)
+        return True
+
     def do_recover(self, obj, **options):
         """Handler for recover operation.

senlin/tests/unit/engine/test_health_manager.py
@@ -21,6 +21,7 @@ from oslo_utils import timeutils as tu

 from senlin.common import consts
 from senlin.common import context
+from senlin.common import exception as exc
 from senlin.common import messaging
 from senlin.common import utils
 from senlin.engine import health_manager as hm
@@ -618,7 +619,27 @@ class TestNodePollStatusHealthCheck(base.SenlinTestCase):
     @mock.patch.object(tu, 'is_older_than')
     def test_run_health_check_healthy(self, mock_tu, mock_node_obj):
         x_entity = mock.Mock()
-        x_entity.do_check.return_value = True
+        x_entity.do_healthcheck.return_value = True
         mock_node_obj.return_value = x_entity

         ctx = mock.Mock()
+        node = mock.Mock(id='FAKE_NODE1', status="ERROR",
+                         updated_at='2018-08-13 18:00:00',
+                         init_at='2018-08-13 17:00:00')
+
+        # do it
+        res = self.hc.run_health_check(ctx, node)
+
+        self.assertTrue(res)
+        mock_tu.assert_not_called()
+
+    @mock.patch.object(node_mod.Node, '_from_object')
+    @mock.patch.object(tu, 'is_older_than')
+    def test_run_health_check_healthy_internal_error(
+            self, mock_tu, mock_node_obj):
+        x_entity = mock.Mock()
+        x_entity.do_healthcheck.side_effect = exc.InternalError(
+            message='error')
+        mock_node_obj.return_value = x_entity
+
+        ctx = mock.Mock()
@@ -636,7 +657,7 @@ class TestNodePollStatusHealthCheck(base.SenlinTestCase):
     @mock.patch.object(tu, 'is_older_than')
     def test_run_health_check_unhealthy(self, mock_tu, mock_node_obj):
         x_entity = mock.Mock()
-        x_entity.do_check.return_value = False
+        x_entity.do_healthcheck.return_value = False
         mock_node_obj.return_value = x_entity

         mock_tu.return_value = True
@@ -657,7 +678,7 @@ class TestNodePollStatusHealthCheck(base.SenlinTestCase):
     def test_run_health_check_unhealthy_within_timeout(
             self, mock_tu, mock_node_obj):
         x_entity = mock.Mock()
-        x_entity.do_check.return_value = False
+        x_entity.do_healthcheck.return_value = False
         mock_node_obj.return_value = x_entity

         mock_tu.return_value = False
@@ -793,8 +814,7 @@ class TestNodePollUrlHealthCheck(base.SenlinTestCase):
         res = self.hc.run_health_check(ctx, node)

         self.assertTrue(res)
-        mock_url_fetch.assert_called_once_with('FAKE_EXPANDED_URL', timeout=1,
-                                               verify=True)
+        mock_url_fetch.assert_not_called()

     @mock.patch.object(tu, "is_older_than")
     @mock.patch.object(hm.NodePollUrlHealthCheck, "_expand_url_template")
@@ -814,8 +834,8 @@ class TestNodePollUrlHealthCheck(base.SenlinTestCase):
         res = self.hc.run_health_check(ctx, node)

         self.assertTrue(res)
-        mock_url_fetch.assert_called_once_with('FAKE_EXPANDED_URL', timeout=1,
-                                               verify=True)
+        mock_url_fetch.assert_has_calls(
+            [mock.call('FAKE_EXPANDED_URL', timeout=1, verify=True)])

     @mock.patch.object(tu, "is_older_than")
     @mock.patch.object(hm.NodePollUrlHealthCheck, "_expand_url_template")
@@ -836,17 +856,14 @@ class TestNodePollUrlHealthCheck(base.SenlinTestCase):
         res = self.hc.run_health_check(ctx, node)

         self.assertTrue(res)
-        mock_url_fetch.assert_called_once_with('FAKE_EXPANDED_URL', timeout=1,
-                                               verify=True)
+        mock_url_fetch.assert_has_calls(
+            [mock.call('FAKE_EXPANDED_URL', timeout=1, verify=True)])

-    @mock.patch.object(time, "sleep")
     @mock.patch.object(tu, "is_older_than")
     @mock.patch.object(hm.NodePollUrlHealthCheck, "_expand_url_template")
     @mock.patch.object(utils, 'url_fetch')
-    def test_run_health_check_unhealthy(self,
-                                        mock_url_fetch,
-                                        mock_expand_url, mock_time,
-                                        mock_sleep):
+    def test_run_health_check_unhealthy(self, mock_url_fetch, mock_expand_url,
+                                        mock_time):
         ctx = mock.Mock()
         node = mock.Mock()
         node.status = consts.NS_ACTIVE
@@ -865,16 +882,13 @@ class TestNodePollUrlHealthCheck(base.SenlinTestCase):
                 mock.call('FAKE_EXPANDED_URL', timeout=1, verify=True)
             ]
         )
-        mock_sleep.assert_has_calls([mock.call(1), mock.call(1)])

-    @mock.patch.object(time, "sleep")
     @mock.patch.object(tu, "is_older_than")
     @mock.patch.object(hm.NodePollUrlHealthCheck, "_expand_url_template")
     @mock.patch.object(utils, 'url_fetch')
     def test_run_health_check_conn_error(self,
                                          mock_url_fetch,
-                                         mock_expand_url, mock_time,
-                                         mock_sleep):
+                                         mock_expand_url, mock_time):
         ctx = mock.Mock()
         node = mock.Mock()
         node.status = consts.NS_ACTIVE
@@ -893,15 +907,31 @@ class TestNodePollUrlHealthCheck(base.SenlinTestCase):
                 mock.call('FAKE_EXPANDED_URL', timeout=1, verify=True)
             ]
         )
-        mock_sleep.assert_has_calls([mock.call(1), mock.call(1)])

-    @mock.patch.object(time, "sleep")
+    @mock.patch.object(tu, "is_older_than")
+    @mock.patch.object(hm.NodePollUrlHealthCheck, "_expand_url_template")
+    @mock.patch.object(utils, 'url_fetch')
+    def test_run_health_check_conn_other_error(self,
+                                               mock_url_fetch,
+                                               mock_expand_url, mock_time):
+        ctx = mock.Mock()
+        node = mock.Mock()
+        node.status = consts.NS_ACTIVE
+        node.id = 'FAKE_ID'
+        mock_time.return_value = True
+        mock_expand_url.side_effect = Exception('blah')
+
+        # do it
+        res = self.hc.run_health_check(ctx, node)
+
+        self.assertTrue(res)
+        mock_url_fetch.assert_not_called()
+
     @mock.patch.object(tu, "is_older_than")
     @mock.patch.object(hm.NodePollUrlHealthCheck, "_expand_url_template")
     @mock.patch.object(utils, 'url_fetch')
     def test_run_health_check_conn_error_noop(
-            self, mock_url_fetch, mock_expand_url, mock_time,
-            mock_sleep):
+            self, mock_url_fetch, mock_expand_url, mock_time):
         ctx = mock.Mock()
         node = mock.Mock()
         node.status = consts.NS_ACTIVE
@@ -921,7 +951,6 @@ class TestNodePollUrlHealthCheck(base.SenlinTestCase):
                 mock.call('FAKE_EXPANDED_URL', timeout=1, verify=True),
             ]
         )
-        mock_sleep.assert_not_called()


 class TestHealthManager(base.SenlinTestCase):

senlin/tests/unit/engine/test_node.py
@@ -594,6 +594,24 @@ class TestNode(base.SenlinTestCase):
                          % node.physical_id,
                          physical_id=None)

+    @mock.patch.object(pb.Profile, 'healthcheck_object')
+    def test_node_healthcheck(self, mock_healthcheck):
+        node = nodem.Node('node1', PROFILE_ID, '')
+        node.status = consts.NS_ACTIVE
+        node.physical_id = 'd94d6333-82e6-4f87-b7ab-b786776df9d1'
+        mock_healthcheck.return_value = True
+        res = node.do_healthcheck(self.context)
+
+        self.assertTrue(res)
+        mock_healthcheck.assert_called_once_with(self.context, node)
+
+    def test_node_healthcheck_no_physical_id(self):
+        node = nodem.Node('node1', PROFILE_ID, '')
+
+        res = node.do_healthcheck(self.context)
+
+        self.assertFalse(res)
+
     @mock.patch.object(nodem.Node, 'set_status')
     @mock.patch.object(pb.Profile, 'recover_object')
     def test_node_recover_new_object(self, mock_recover, mock_status):
@@ -611,7 +629,7 @@ class TestNode(base.SenlinTestCase):
         mock_recover.return_value = new_id, True
         mock_status.side_effect = set_status
         action = mock.Mock()
-        action.inputs = {'operation': ['SWIM', 'DANCE']}
+        action.inputs = {'operation': [{'SWIM': 1, 'DANCE': 2}]}

         res = node.do_recover(self.context, action)

@@ -793,7 +811,7 @@ class TestNode(base.SenlinTestCase):
             id=node.physical_id,
             reason='Boom!'
         )
-        action = mock.Mock(inputs={'operation': ['boom'],
+        action = mock.Mock(inputs={'operation': [{'boom': 1}],
                                    'check': True})

         res = node.do_recover(self.context, action)
@@ -830,14 +848,93 @@ class TestNode(base.SenlinTestCase):
             mock.call(self.context, consts.NS_ERROR,
                       reason='Recovery failed')])

-    def test_node_recover_no_physical_id(self):
+    def test_node_recover_no_physical_id_reboot_op(self):
         node = nodem.Node('node1', PROFILE_ID, None)
-        action = mock.Mock()
+        action = mock.Mock(inputs={'operation': [{'name': 'REBOOT'}]})

         res = node.do_recover(self.context, action)

         self.assertFalse(res)

+    def test_node_recover_no_physical_id_rebuild_op(self):
+        node = nodem.Node('node1', PROFILE_ID, None)
+        action = mock.Mock(inputs={'operation': [{'name': 'REBUILD'}]})
+
+        res = node.do_recover(self.context, action)
+
+        self.assertFalse(res)
+
+    @mock.patch.object(nodem.Node, 'set_status')
+    @mock.patch.object(pb.Profile, 'recover_object')
+    def test_node_recover_no_physical_id_no_op(self, mock_recover,
+                                               mock_status):
+        def set_status(*args, **kwargs):
+            if args[1] == 'ACTIVE':
+                node.physical_id = new_id
+                node.data = {'recovery': 'RECREATE'}
+
+        node = nodem.Node('node1', PROFILE_ID, '', id='fake')
+        new_id = '166db83b-b4a4-49ef-96a8-6c0fdd882d1a'
+        mock_recover.return_value = new_id, True
+        mock_status.side_effect = set_status
+        mock_check = self.patchobject(pb.Profile, 'check_object')
+        mock_check.return_value = False
+        action = mock.Mock(
+            outputs={}, inputs={})
+
+        res = node.do_recover(self.context, action)
+
+        self.assertTrue(res)
+        mock_check.assert_not_called()
+        mock_recover.assert_called_once_with(
+            self.context, node, **action.inputs)
+        self.assertEqual('node1', node.name)
+        self.assertEqual(new_id, node.physical_id)
+        self.assertEqual(PROFILE_ID, node.profile_id)
+        mock_status.assert_has_calls([
+            mock.call(self.context, 'RECOVERING',
+                      reason='Recovery in progress'),
+            mock.call(self.context, consts.NS_ACTIVE,
+                      reason='Recovery succeeded',
+                      physical_id=new_id,
+                      data={'recovery': 'RECREATE'})])
+
+    @mock.patch.object(nodem.Node, 'set_status')
+    @mock.patch.object(pb.Profile, 'recover_object')
+    def test_node_recover_no_physical_id_recreate_op(self, mock_recover,
+                                                     mock_status):
+        def set_status(*args, **kwargs):
+            if args[1] == 'ACTIVE':
+                node.physical_id = new_id
+                node.data = {'recovery': 'RECREATE'}
+
+        node = nodem.Node('node1', PROFILE_ID, '', id='fake')
+        new_id = '166db83b-b4a4-49ef-96a8-6c0fdd882d1a'
+        mock_recover.return_value = new_id, True
+        mock_status.side_effect = set_status
+        mock_check = self.patchobject(pb.Profile, 'check_object')
+        mock_check.return_value = False
+        action = mock.Mock(
+            outputs={}, inputs={'operation': [{'name': 'RECREATE'}],
+                                'check': True})
+
+        res = node.do_recover(self.context, action)
+
+        self.assertTrue(res)
+        mock_check.assert_called_once_with(self.context, node)
+        mock_recover.assert_called_once_with(
+            self.context, node, **action.inputs)
+        self.assertEqual('node1', node.name)
+        self.assertEqual(new_id, node.physical_id)
+        self.assertEqual(PROFILE_ID, node.profile_id)
+        mock_status.assert_has_calls([
+            mock.call(self.context, 'RECOVERING',
+                      reason='Recovery in progress'),
+            mock.call(self.context, consts.NS_ACTIVE,
+                      reason='Recovery succeeded',
+                      physical_id=new_id,
+                      data={'recovery': 'RECREATE'})])
+
     @mock.patch.object(nodem.Node, 'set_status')
     def test_node_recover_operation_not_support(self, mock_set_status):
         node = nodem.Node('node1', PROFILE_ID, None)

senlin/tests/unit/profiles/test_nova_server.py
@@ -1369,6 +1369,76 @@ class TestNovaServerBasic(base.SenlinTestCase):
                          six.text_type(ex))
         cc.server_get.assert_called_once_with('FAKE_ID')

+    def test_do_healthcheck_active(self):
+        profile = server.ServerProfile('t', self.spec)
+
+        cc = mock.Mock()
+        cc.server_get.return_value = mock.Mock(status='ACTIVE')
+        profile._computeclient = cc
+
+        test_server = mock.Mock(physical_id='FAKE_ID')
+
+        res = profile.do_healthcheck(test_server)
+        cc.server_get.assert_called_once_with('FAKE_ID')
+        self.assertTrue(res)
+
+    def test_do_healthcheck_empty_server_obj(self):
+        profile = server.ServerProfile('t', self.spec)
+
+        cc = mock.Mock()
+        cc.server_get.return_value = None
+        profile._computeclient = cc
+
+        test_server = mock.Mock(physical_id='FAKE_ID')
+
+        res = profile.do_healthcheck(test_server)
+        cc.server_get.assert_called_once_with('FAKE_ID')
+        self.assertTrue(res)
+
+    def test_do_healthcheck_exception(self):
+        profile = server.ServerProfile('t', self.spec)
+
+        cc = mock.Mock()
+        ex = exc.InternalError(code=503, message='Error')
+        cc.server_get.side_effect = ex
+        profile._computeclient = cc
+
+        test_server = mock.Mock(physical_id='FAKE_ID')
+
+        res = profile.do_healthcheck(test_server)
+
+        cc.server_get.assert_called_once_with('FAKE_ID')
+        self.assertTrue(res)
+
+    def test_do_healthcheck_error(self):
+        profile = server.ServerProfile('t', self.spec)
+
+        cc = mock.Mock()
+        cc.server_get.return_value = mock.Mock(status='ERROR')
+        profile._computeclient = cc
+
+        test_server = mock.Mock(physical_id='FAKE_ID')
+
+        res = profile.do_healthcheck(test_server)
+
+        cc.server_get.assert_called_once_with('FAKE_ID')
+        self.assertFalse(res)
+
+    def test_do_healthcheck_server_not_found(self):
+        profile = server.ServerProfile('t', self.spec)
+
+        cc = mock.Mock()
+        ex = exc.InternalError(code=404, message='No Server found')
+        cc.server_get.side_effect = ex
+        profile._computeclient = cc
+
+        test_server = mock.Mock(physical_id='FAKE_ID')
+
+        res = profile.do_healthcheck(test_server)
+
+        cc.server_get.assert_called_once_with('FAKE_ID')
+        self.assertFalse(res)
+
     @mock.patch.object(server.ServerProfile, 'do_delete')
     @mock.patch.object(server.ServerProfile, 'do_create')
     def test_do_recover_operation_is_none(self, mock_create, mock_delete):

senlin/tests/unit/profiles/test_profile_base.py
@@ -441,10 +441,10 @@ class TestProfileBase(base.SenlinTestCase):
             side_effect=exception.InternalError(code=400, message='BAD'))
         obj = mock_load

-        res = profile.check_object(self.ctx, obj)
+        self.assertRaises(exception.InternalError, profile.check_object,
+                          self.ctx, obj)

         profile.load(self.ctx).do_check.assert_called_once_with(obj)
-        self.assertFalse(res)

     @mock.patch.object(pb.Profile, 'load')
     def test_update_object_with_profile(self, mock_load):