Do not override RESTART_REQUIRED service status

The RESTART_REQUIRED service status set by guest agent should not
be overridden by the health heartbeat mechanism.

The RESTART_REQUIRED status can only be changed when the instance is
restarted or rebooted.

Story: 2008612
Task: 41795
Change-Id: I98baf252452353237bc8fb14357df4e7bcb2867a
This commit is contained in:
Lingxian Kong 2021-02-10 09:53:06 +13:00
parent 6edd3332b6
commit 9c2e0bf3a0
5 changed files with 138 additions and 43 deletions

View File

@ -24,7 +24,7 @@ from trove.common.serializable_notification import SerializableNotification
from trove.conductor.models import LastSeen
from trove.extensions.mysql import models as mysql_models
from trove.instance import models as inst_models
from trove.instance.service_status import ServiceStatus
from trove.instance import service_status as svc_status
LOG = logging.getLogger(__name__)
CONF = cfg.CONF
@ -86,11 +86,20 @@ class Manager(periodic_task.PeriodicTasks):
"payload": str(payload)})
status = inst_models.InstanceServiceStatus.find_by(
instance_id=instance_id)
if self._message_too_old(instance_id, 'heartbeat', sent):
return
if status.get_status() == svc_status.ServiceStatuses.RESTART_REQUIRED:
LOG.debug("Instance %s service status is RESTART_REQUIRED, "
"skip heartbeat", instance_id)
return
if payload.get('service_status') is not None:
status.set_status(
ServiceStatus.from_description(payload['service_status']))
svc_status.ServiceStatus.from_description(
payload['service_status'])
)
status.save()
def update_backup(self, context, instance_id, backup_id,

View File

@ -15,6 +15,7 @@
import enum
import hashlib
import os
from pathlib import Path
from requests.exceptions import ConnectionError
from oslo_log import log as logging
@ -245,34 +246,36 @@ class GuestLog(object):
'published': self._published_size})
def _update_details(self):
# Make sure we can read the file
if not self._file_readable or not os.access(self._file, os.R_OK):
if not os.access(self._file, os.R_OK):
if operating_system.exists(self._file, as_root=True):
operating_system.chmod(
self._file, FileMode.ADD_ALL_R, as_root=True)
self._file_readable = True
if operating_system.exists(self._file, as_root=True):
file_path = Path(self._file)
if os.path.isfile(self._file):
logstat = os.stat(self._file)
self._size = logstat.st_size
# Make sure guest agent can read the log file.
if not os.access(self._file, os.R_OK):
operating_system.chmod(self._file, FileMode.ADD_ALL_R,
as_root=True)
operating_system.chmod(str(file_path.parent),
FileMode.ADD_GRP_RX_OTH_RX,
as_root=True)
self._size = file_path.stat().st_size
self._update_log_header_digest(self._file)
if self.status != LogStatus.Disabled:
if self._log_rotated():
self.status = LogStatus.Rotated
# See if we have stuff to publish
elif logstat.st_size > self._published_size:
elif self._size > self._published_size:
self._set_status(self._published_size,
LogStatus.Partial, LogStatus.Ready)
# We've published everything so far
elif logstat.st_size == self._published_size:
elif self._size == self._published_size:
self._set_status(self._published_size,
LogStatus.Published, LogStatus.Enabled)
# We've already handled this case (log rotated) so what gives?
else:
raise Exception(_("Bug in _log_rotated ?"))
else:
LOG.warning(f"File {self._file} does not exist")
self._published_size = 0
self._size = 0

View File

@ -607,23 +607,30 @@ def load_instance(cls, context, id, needs_server=False,
return cls(context, db_info, server, service_status)
def load_instance_with_info(cls, context, id, cluster_id=None):
db_info = get_db_info(context, id, cluster_id)
service_status = InstanceServiceStatus.find_by(instance_id=id)
if (db_info.task_status == InstanceTasks.NONE and
not service_status.is_uptodate()):
LOG.warning('Guest agent heartbeat for instance %s has expired', id)
def update_service_status(task_status, service_status, ins_id):
    """Flag a stale guest agent heartbeat on the service status record.

    When the instance has no task in progress, its service status is not
    RESTART_REQUIRED (which only an actual restart/reboot may clear), and
    the heartbeat is no longer up to date, mark the guest agent as timed
    out. The mutation happens in place on ``service_status``; the caller
    is responsible for persisting it if needed.

    :param task_status: current InstanceTasks value for the instance.
    :param service_status: InstanceServiceStatus record to update.
    :param ins_id: instance id, used for logging only.
    """
    if (task_status == InstanceTasks.NONE and
            service_status.status !=
            srvstatus.ServiceStatuses.RESTART_REQUIRED and
            not service_status.is_uptodate()):
        # Fix: log message previously misspelled "expired" as "expried".
        LOG.warning('Guest agent heartbeat for instance %s has expired',
                    ins_id)
        service_status.status = \
            srvstatus.ServiceStatuses.FAILED_TIMEOUT_GUESTAGENT
def load_instance_with_info(cls, context, ins_id, cluster_id=None):
db_info = get_db_info(context, ins_id, cluster_id)
service_status = InstanceServiceStatus.find_by(instance_id=ins_id)
update_service_status(db_info.task_status, service_status, ins_id)
load_simple_instance_server_status(context, db_info)
load_simple_instance_addresses(context, db_info)
instance = cls(context, db_info, service_status)
load_guest_info(instance, context, id)
load_guest_info(instance, context, ins_id)
load_server_group_info(instance, context)
@ -901,6 +908,11 @@ class BaseInstance(SimpleInstance):
del_instance.set_status(srvstatus.ServiceStatuses.DELETED)
del_instance.save()
def set_servicestatus_restart(self):
    """Persist a RESTARTING service status for this instance."""
    status_record = InstanceServiceStatus.find_by(instance_id=self.id)
    status_record.set_status(srvstatus.ServiceStatuses.RESTARTING)
    status_record.save()
def set_instance_fault_deleted(self):
try:
del_fault = DBInstanceFault.find_by(instance_id=self.id)
@ -1442,7 +1454,10 @@ class Instance(BuiltInstance):
LOG.info("Rebooting instance %s.", self.id)
if self.db_info.cluster_id is not None and not self.context.is_admin:
raise exception.ClusterInstanceOperationNotSupported()
self.update_db(task_status=InstanceTasks.REBOOTING)
self.set_servicestatus_restart()
task_api.API(self.context).reboot(self.id)
def restart(self):
@ -1450,13 +1465,10 @@ class Instance(BuiltInstance):
LOG.info("Restarting datastore on instance %s.", self.id)
if self.db_info.cluster_id is not None and not self.context.is_admin:
raise exception.ClusterInstanceOperationNotSupported()
# Set our local status since Nova might not change it quick enough.
# TODO(tim.simpson): Possible bad stuff can happen if this service
# shuts down before it can set status to NONE.
# We need a last updated time to mitigate this;
# after some period of tolerance, we'll assume the
# status is no longer in effect.
self.update_db(task_status=InstanceTasks.REBOOTING)
self.set_servicestatus_restart()
task_api.API(self.context).restart(self.id)
def detach_replica(self):
@ -1829,9 +1841,9 @@ class Instances(object):
db.server_status = "SHUTDOWN" # Fake it...
db.addresses = []
datastore_status = InstanceServiceStatus.find_by(
service_status = InstanceServiceStatus.find_by(
instance_id=db.id)
if not datastore_status.status: # This should never happen.
if not service_status.status: # This should never happen.
LOG.error("Server status could not be read for "
"instance id(%s).", db.id)
continue
@ -1839,24 +1851,15 @@ class Instances(object):
# Get the real-time service status.
LOG.debug('Task status for instance %s: %s', db.id,
db.task_status)
if db.task_status == InstanceTasks.NONE:
last_heartbeat_delta = (
timeutils.utcnow() - datastore_status.updated_at)
agent_expiry_interval = timedelta(
seconds=CONF.agent_heartbeat_expiry)
if last_heartbeat_delta > agent_expiry_interval:
LOG.warning(
'Guest agent heartbeat for instance %s has '
'expired', id)
datastore_status.status = \
srvstatus.ServiceStatuses.FAILED_TIMEOUT_GUESTAGENT
update_service_status(db.task_status, service_status, db.id)
except exception.ModelNotFoundError:
LOG.error("Server status could not be read for "
"instance id(%s).", db.id)
continue
ret.append(load_instance(context, db, datastore_status,
server=server))
ret.append(
load_instance(context, db, service_status, server=server)
)
return ret

View File

@ -36,6 +36,7 @@ class ServiceStatus(object):
ServiceStatuses.CRASHED._code,
ServiceStatuses.BLOCKED._code,
ServiceStatuses.HEALTHY._code,
ServiceStatuses.RESTART_REQUIRED._code,
]
return self._code in allowed_statuses

View File

@ -11,11 +11,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from datetime import timedelta
from unittest import mock
from trove.common import cfg
from trove.common import clients
from trove.common import exception
from trove.common import timeutils
from trove.datastore import models as ds_models
from trove.instance import models as ins_models
from trove.instance import service
@ -174,3 +176,80 @@ class TestInstanceController(trove_testtools.TestCase):
mock.ANY, "upgrade",
instance_id=instance.id,
datastore_version_id=new_ds_version.id)
@mock.patch('trove.instance.models.load_server_group_info')
@mock.patch('trove.instance.models.load_guest_info')
@mock.patch('trove.instance.models.load_simple_instance_addresses')
@mock.patch('trove.instance.models.load_simple_instance_server_status')
def test_show_with_restart_required(self, load_server_mock,
                                    load_addr_mock, load_guest_mock,
                                    load_server_grp_mock):
    """RESTART_REQUIRED must survive an expired heartbeat.

    An instance whose service status is RESTART_REQUIRED keeps
    reporting that operating status even after the guest agent
    heartbeat has expired.
    """
    # Persist an ACTIVE instance with no task in progress.
    db_inst = ins_models.DBInstance.create(
        name=self.random_name('instance'),
        flavor_id=self.random_uuid(),
        tenant_id=self.random_uuid(),
        volume_size=1,
        datastore_version_id=self.ds_version_imageid.id,
        task_status=ins_models.InstanceTasks.NONE,
        compute_instance_id=self.random_uuid(),
        server_status='ACTIVE'
    )
    ins_models.InstanceServiceStatus.create(
        instance_id=db_inst.id,
        status=srvstatus.ServiceStatuses.RESTART_REQUIRED,
    )

    # Back-date updated_at past the heartbeat expiry window so the
    # heartbeat counts as stale.
    status_record = ins_models.InstanceServiceStatus.find_by(
        instance_id=db_inst.id)
    status_record.updated_at = timeutils.utcnow() - timedelta(
        seconds=(CONF.agent_heartbeat_expiry + 60))
    ins_models.get_db_api().save(status_record)

    resp = self.controller.show(mock.MagicMock(), mock.ANY, db_inst.id)

    self.assertEqual(200, resp.status)
    shown = resp.data(None)['instance']
    self.assertEqual('ACTIVE', shown.get('status'))
    self.assertEqual('RESTART_REQUIRED',
                     shown.get('operating_status'))
@mock.patch('trove.instance.models.load_server_group_info')
@mock.patch('trove.instance.models.load_guest_info')
@mock.patch('trove.instance.models.load_simple_instance_addresses')
@mock.patch('trove.instance.models.load_simple_instance_server_status')
def test_show_without_restart_required(self, load_server_mock,
                                       load_addr_mock, load_guest_mock,
                                       load_server_grp_mock):
    """A non-RESTART_REQUIRED status is downgraded on expired heartbeat.

    With a HEALTHY service status and a stale guest agent heartbeat,
    the instance's operating status must be reported as ERROR.
    """
    # Persist an ACTIVE instance with no task in progress.
    db_inst = ins_models.DBInstance.create(
        name=self.random_name('instance'),
        flavor_id=self.random_uuid(),
        tenant_id=self.random_uuid(),
        volume_size=1,
        datastore_version_id=self.ds_version_imageid.id,
        task_status=ins_models.InstanceTasks.NONE,
        compute_instance_id=self.random_uuid(),
        server_status='ACTIVE'
    )
    ins_models.InstanceServiceStatus.create(
        instance_id=db_inst.id,
        status=srvstatus.ServiceStatuses.HEALTHY,
    )

    # Back-date updated_at past the heartbeat expiry window so the
    # heartbeat counts as stale.
    status_record = ins_models.InstanceServiceStatus.find_by(
        instance_id=db_inst.id)
    status_record.updated_at = timeutils.utcnow() - timedelta(
        seconds=(CONF.agent_heartbeat_expiry + 60))
    ins_models.get_db_api().save(status_record)

    resp = self.controller.show(mock.MagicMock(), mock.ANY, db_inst.id)

    self.assertEqual(200, resp.status)
    shown = resp.data(None)['instance']
    self.assertEqual('ACTIVE', shown.get('status'))
    self.assertEqual('ERROR', shown.get('operating_status'))