Avoid reboot loop when patch fails

When the remote pull of a reboot-required patch to a host fails, the
system enters a reboot loop. This happens because the reboot-required
flag is left set even though the installation process failed.

This commit fixes it by checking whether the flag still exists after an
install that made no changes; if it does, the flag is deleted and a
proper error is reported.

Test-plan:
PASS: Fail a patch during deploy host
      - The system should set the state to host-failed
      - The system must not enter a reboot loop

Story: 2010676
Task: 51192

Change-Id: Ib311f28911620cd14df357e06ff9e5afcf82b745
Signed-off-by: Lindley Vieira <lindley.vieira@windriver.com>
This commit is contained in:
Lindley Vieira 2024-10-18 16:19:44 -03:00
parent c4f8751daa
commit bc30e3464e
2 changed files with 22 additions and 16 deletions

@ -288,7 +288,7 @@ def pull_ostree_from_remote(remote=None):
ref_cmd = "ostree refs --force --create=%s %s" % (ref, constants.OSTREE_REF)
try:
subprocess.run(cmd % ref, shell=True, check=True, capture_output=True)
output = subprocess.run(cmd % ref, shell=True, check=True, capture_output=True)
except subprocess.CalledProcessError as e:
msg = "Failed to pull from %s remote into sysroot ostree" % ref
err_msg = "OSTree Pull Error: return code: %s, Output: %s" \
@ -296,6 +296,10 @@ def pull_ostree_from_remote(remote=None):
LOG.exception(err_msg)
raise OSTreeCommandFail(msg)
# Log to help identify errors
msg = "Remote pull output: %s" % output
LOG.info(msg)
if ref_cmd:
try:
subprocess.run(ref_cmd, shell=True, check=True, capture_output=True)

@ -510,6 +510,12 @@ class PatchAgent(PatchService):
self.listener.bind(('', self.port))
self.listener.listen(2) # Allow two connections, for two controllers
def set_install_failed_flags(self):
    """Set flags and states for a failed patch

    Records the failure in three places, in this order: the in-memory
    ``patch_failed`` attribute, the ``patch_failed_file`` flag (via
    ``setflag``), and the agent ``state``, which becomes
    ``constants.PATCH_AGENT_STATE_INSTALL_FAILED``.
    """
    self.patch_failed = True
    # NOTE(review): setflag presumably creates the flag file on disk so the
    # failure is visible outside this process -- confirm against its definition.
    setflag(patch_failed_file)
    self.state = constants.PATCH_AGENT_STATE_INSTALL_FAILED
def query(self, major_release=None):
"""Check current patch state """
if not self.install_local and not check_install_uuid():
@ -574,10 +580,7 @@ class PatchAgent(PatchService):
# controller, we don't want to install patches.
if not self.install_local and not check_install_uuid():
LOG.error("Failed install_uuid check. Skipping install")
self.patch_failed = True
setflag(patch_failed_file)
self.state = constants.PATCH_AGENT_STATE_INSTALL_FAILED
self.set_install_failed_flags()
# Send a hello to provide a state update
if self.sock_out is not None:
@ -611,9 +614,7 @@ class PatchAgent(PatchService):
clearflag(patch_failed_file)
self.state = constants.PATCH_AGENT_STATE_IDLE
else:
self.patch_failed = True
setflag(patch_failed_file)
self.state = constants.PATCH_AGENT_STATE_INSTALL_FAILED
self.set_install_failed_flags()
return success
# prepare major release deployment
@ -754,22 +755,23 @@ class PatchAgent(PatchService):
except Exception as e:
LOG.exception("Failure running hooks: %s" % str(e))
setflag(run_hooks_flag)
self.patch_failed = True
setflag(patch_failed_file)
self.state = constants.PATCH_AGENT_STATE_INSTALL_FAILED
self.set_install_failed_flags()
success = False
else:
# Update the patch_failed flag
self.patch_failed = True
setflag(patch_failed_file)
self.state = constants.PATCH_AGENT_STATE_INSTALL_FAILED
self.set_install_failed_flags()
clearflag(patch_installing_file)
self.query()
self.query() # Update self.changes
if self.changes:
LOG.warning("Installing the patch did not change the patch current status")
if os.path.exists(node_is_software_updated_rr_file):
LOG.error("No deployment created and reboot required flag exists")
self.set_install_failed_flags()
# Clear flag to avoid reboot loop
clearflag(node_is_software_updated_rr_file)
# Send a hello to provide a state update
if self.sock_out is not None:
hello_ack = PatchMessageHelloAgentAck()