Files
update/software/software/software_agent.py
Heitor Matsui f1ea56351f Add lvm snapshot removal to the deploy delete request
On deploy delete for a major release deployment, a message is
sent from the software-controller to all hosts, requesting them
to perform cleanup tasks related to the upgrade process.

This commit includes the step to clean up lvm snapshots, in case
the feature was used on the deploy start. There are no need for
special treatment to check if the system is AIO-SX or snapshots
were created since:
- If system is not AIO-SX, the feature will not be activated,
  so there won't be any snapshots
- If there aren't any snapshots, the snapshot delete function
  will just ignore it and return success

Test Plan (AIO-SX only):
PASS: run deploy delete with no lvm snapshots created, verify
      the process finishes successfully
PASS: run deploy delete with existing lvm snapshots, verify
      the process deletes them and finishes successfully

Story: 2011357
Task:

Change-Id: I7e71d33eed5be166afbf6658f6392608b9ffe8fe
Signed-off-by: Heitor Matsui <heitorvieira.matsui@windriver.com>
2025-04-09 17:23:22 -03:00

1071 lines
41 KiB
Python

"""
Copyright (c) 2024-2025 Wind River Systems, Inc.
SPDX-License-Identifier: Apache-2.0
"""
import json
import os
import random
import requests
import select
import shutil
import socket
import subprocess
import sys
import time
import software.ostree_utils as ostree_utils
from software.software_functions import configure_logging
from software.software_functions import execute_agent_hooks
from software.software_functions import remove_major_release_deployment_flags
from software.software_functions import LvmSnapshotManager
from software.software_functions import LOG
import software.config as cfg
from software.base import PatchService
from software.exceptions import OSTreeCommandFail
import software.utils as utils
import software.messages as messages
import software.constants as constants
from tsconfig.tsconfig import http_port
from tsconfig.tsconfig import install_uuid
from tsconfig.tsconfig import subfunctions
from tsconfig.tsconfig import SW_VERSION
SOFTWARE_PERSIST_FOLDER = "/var/persist/software-agent"
pidfile_path = "/var/run/software_agent.pid"
node_is_patched_file = "/var/run/node_is_patched"
node_is_software_updated_rr_file = "%s/node_is_software_updated_rr" % SOFTWARE_PERSIST_FOLDER
patch_installing_file = "%s/patch_installing" % SOFTWARE_PERSIST_FOLDER
patch_failed_file = "/var/run/software_install_failed"
node_is_locked_file = "/var/run/.node_locked"
ostree_pull_completed_deployment_pending_file = \
"/var/run/ostree_pull_completed_deployment_pending"
run_hooks_flag = "/var/run/run_hooks"
mount_pending_file = "/var/run/mount_pending"
insvc_software_scripts = "/run/software/software-scripts"
insvc_software_flags = "/run/software/software-flags"
insvc_software_restart_agent = "/run/software/.restart.software-agent"
run_install_software_scripts_cmd = "/usr/sbin/run-software-scripts"
pa = None
http_port_real = http_port
def setflag(fname):
try:
with open(fname, "w") as f:
f.write("%d\n" % os.getpid())
except Exception:
LOG.exception("Failed to update %s flag", fname)
def clearflag(fname):
if os.path.exists(fname):
try:
os.remove(fname)
except Exception:
LOG.exception("Failed to clear %s flag", fname)
def pull_install_scripts_from_controller(install_local=False):
# If the rsync fails, it raises an exception to
# the caller "handle_install()" and fails the
# host-install request for this host.
# The restart_scripts are optional, so if the files
# are not present, it should not raise any exception
host = constants.CONTROLLER
if install_local:
host = '127.0.0.1'
try:
output = subprocess.check_output(["rsync",
"-acv",
"--delete",
"--exclude", "tmp",
"rsync://%s/repo/software-scripts/" % host,
"%s/" % insvc_software_scripts],
stderr=subprocess.STDOUT)
LOG.info("Synced restart scripts from controller: %s", output)
except subprocess.CalledProcessError as e:
if "No such file or directory" in e.output.decode("utf-8"):
LOG.info("No restart scripts contained in the release")
else:
LOG.exception("Failed to sync restart scripts from controller")
raise
def run_post_install_script():
LOG.info("Running post-install patch-scripts")
try:
subprocess.check_output([run_install_software_scripts_cmd, "postinstall"],
stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
LOG.exception("Failed to execute post-install scripts.")
LOG.error("Command output: %s", e.output)
def check_install_uuid():
controller_install_uuid_url = "http://%s:%s/feed/rel-%s/install_uuid" % (constants.CONTROLLER,
http_port_real,
SW_VERSION)
try:
req = requests.get(controller_install_uuid_url)
if req.status_code != 200:
# If we're on controller-1, controller-0 may not have the install_uuid
# matching this release, if we're in an upgrade. If the file doesn't exist,
# bypass this check
if socket.gethostname() == "controller-1":
return True
LOG.error("Failed to get install_uuid from controller")
return False
except requests.ConnectionError:
LOG.error("Failed to connect to controller")
return False
controller_install_uuid = str(req.text).rstrip()
if install_uuid != controller_install_uuid:
LOG.error("Local install_uuid=%s doesn't match controller=%s", install_uuid, controller_install_uuid)
return False
return True
class PatchMessageSendLatestFeedCommit(messages.PatchMessage):
def __init__(self):
messages.PatchMessage.__init__(self, messages.PATCHMSG_SEND_LATEST_FEED_COMMIT)
def decode(self, data):
global pa
messages.PatchMessage.decode(self, data)
if 'latest_feed_commit' in data:
pa.latest_feed_commit = data['latest_feed_commit']
def encode(self):
messages.PatchMessage.encode(self)
def handle(self, sock, addr):
global pa
# Check if the node is patch current
pa.query()
class PatchMessageHelloAgent(messages.PatchMessage):
def __init__(self):
messages.PatchMessage.__init__(self, messages.PATCHMSG_HELLO_AGENT)
self.patch_op_counter = 0
def decode(self, data):
messages.PatchMessage.decode(self, data)
if 'patch_op_counter' in data:
self.patch_op_counter = data['patch_op_counter']
def encode(self):
messages.PatchMessage.encode(self)
def handle(self, sock, addr):
# Send response
#
# If a user tries to do a host-install on an unlocked node,
# without bypassing the lock check (either via in-service
# patch or --force option), the agent will set its state
# to Install-Rejected in order to report back the rejection.
# However, since this should just be a transient state,
# we don't want the client reporting the Install-Rejected
# state indefinitely, so reset it to Idle after a minute or so.
#
if pa.state == constants.PATCH_AGENT_STATE_INSTALL_REJECTED:
if os.path.exists(node_is_locked_file):
# Node has been locked since rejected attempt. Reset the state
pa.state = constants.PATCH_AGENT_STATE_IDLE
elif (time.time() - pa.rejection_timestamp) > 60:
# Rejected state for more than a minute. Reset it.
pa.state = constants.PATCH_AGENT_STATE_IDLE
if self.patch_op_counter > 0:
pa.handle_patch_op_counter(self.patch_op_counter)
resp = PatchMessageHelloAgentAck()
resp.send(sock)
def send(self, sock): # pylint: disable=unused-argument
LOG.error("Should not get here")
class PatchMessageHelloAgentAck(messages.PatchMessage):
def __init__(self):
messages.PatchMessage.__init__(self, messages.PATCHMSG_HELLO_AGENT_ACK)
def encode(self):
global pa
messages.PatchMessage.encode(self)
self.message['query_id'] = pa.query_id
self.message['out_of_date'] = pa.changes
self.message['hostname'] = socket.gethostname()
self.message['requires_reboot'] = pa.node_is_patched
self.message['patch_failed'] = pa.patch_failed
self.message['sw_version'] = SW_VERSION
self.message['state'] = pa.state
def handle(self, sock, addr):
LOG.error("Should not get here")
def send(self, sock):
global pa
self.encode()
message = json.dumps(self.message)
sock.sendto(str.encode(message), (pa.controller_address, cfg.controller_port))
class PatchMessageQueryDetailed(messages.PatchMessage):
def __init__(self):
messages.PatchMessage.__init__(self, messages.PATCHMSG_QUERY_DETAILED)
def decode(self, data):
messages.PatchMessage.decode(self, data)
def encode(self):
# Nothing to add to the HELLO_AGENT, so just call the super class
messages.PatchMessage.encode(self)
def handle(self, sock, addr):
# Send response
LOG.info("Handling detailed query")
resp = PatchMessageQueryDetailedResp()
resp.send(sock)
def send(self, sock): # pylint: disable=unused-argument
LOG.error("Should not get here")
class PatchMessageQueryDetailedResp(messages.PatchMessage):
def __init__(self):
messages.PatchMessage.__init__(self, messages.PATCHMSG_QUERY_DETAILED_RESP)
def encode(self):
global pa
messages.PatchMessage.encode(self)
self.message['latest_sysroot_commit'] = pa.latest_sysroot_commit
self.message['nodetype'] = cfg.nodetype
self.message['sw_version'] = SW_VERSION
self.message['subfunctions'] = subfunctions
self.message['state'] = pa.state
def handle(self, sock, addr):
LOG.error("Should not get here")
def send(self, sock):
self.encode()
message = json.dumps(self.message)
sock.sendall(str.encode(message))
class PatchMessageAgentInstallReq(messages.PatchMessage):
def __init__(self):
messages.PatchMessage.__init__(self, messages.PATCHMSG_AGENT_INSTALL_REQ)
self.force = False
self.major_release = None
self.commit_id = None
self.additional_data = {}
def decode(self, data):
messages.PatchMessage.decode(self, data)
msg = f"Received InstallReq {data}"
LOG.info(msg)
if 'force' in data:
self.force = data['force']
if 'major_release' in data:
self.major_release = data['major_release']
if 'commit_id' in data:
self.commit_id = data['commit_id']
if 'additional_data' in data:
self.additional_data = data['additional_data']
def encode(self):
# Nothing to add to the HELLO_AGENT, so just call the super class
messages.PatchMessage.encode(self)
def handle(self, sock, addr):
LOG.info("Handling host install request, force=%s, major_release=%s, commit_id=%s",
self.force, self.major_release, self.commit_id)
global pa
resp = PatchMessageAgentInstallResp()
if not self.force:
setflag(node_is_software_updated_rr_file)
resp.reboot_required = True
if not os.path.exists(node_is_locked_file):
if self.force:
LOG.info("Installing on unlocked node, with force option")
else:
LOG.info("Rejecting install request on unlocked node")
pa.state = constants.PATCH_AGENT_STATE_INSTALL_REJECTED
pa.rejection_timestamp = time.time()
resp.status = False
resp.reject_reason = 'Node must be locked.'
resp.send(sock, addr)
return
resp.status = pa.handle_install(major_release=self.major_release,
commit_id=self.commit_id,
additional_data=self.additional_data)
resp.send(sock, addr)
def send(self, sock): # pylint: disable=unused-argument
LOG.error("Should not get here")
class PatchMessageAgentInstallResp(messages.PatchMessage):
def __init__(self):
messages.PatchMessage.__init__(self, messages.PATCHMSG_AGENT_INSTALL_RESP)
self.status = False
self.reject_reason = None
self.reboot_required = False
def encode(self):
global pa
messages.PatchMessage.encode(self)
self.message['status'] = self.status
if self.reject_reason is not None:
self.message['reject_reason'] = self.reject_reason
self.message['reboot_required'] = self.reboot_required
def handle(self, sock, addr):
LOG.error("Should not get here")
def send(self, sock, addr):
address = (addr[0], cfg.controller_port)
self.encode()
message = json.dumps(self.message)
sock.sendto(str.encode(message), address)
# Send a hello ack to follow it
resp = PatchMessageHelloAgentAck()
resp.send(sock)
class SoftwareMessageDeployDeleteCleanupReq(messages.PatchMessage):
def __init__(self):
messages.PatchMessage.__init__(self, messages.PATCHMSG_DEPLOY_DELETE_CLEANUP_REQ)
self.major_release = None
def decode(self, data):
messages.PatchMessage.encode(self)
if "major_release" in data:
self.major_release = data["major_release"]
def handle(self, sock, addr):
LOG.info("Handling deploy delete cleanup request, major_release=%s" % self.major_release)
# remove temporary remote and ref created during the upgrade process
success_ostree_remote_cleanup = ostree_utils.delete_temporary_refs_and_remotes()
# update the default remote 'debian' to point to the to-release feed
nodetype = utils.get_platform_conf("nodetype")
success_ostree_remote_update = ostree_utils.add_ostree_remote(
self.major_release, nodetype, replace_default_remote=True)
# remove the local upgrade flags created for the upgrade process
success_remove_upgrade_flags = remove_major_release_deployment_flags()
# undeploy the from-release ostree deployment to free sysroot disk space
success_ostree_undeploy_from_release = ostree_utils.undeploy_inactive_deployments()
# remove the lvm snapshots created during the upgrade process
success_remove_lvm_snapshots = True
try:
lsm = LvmSnapshotManager()
lsm.delete_snapshots()
except Exception:
success_remove_lvm_snapshots = False
cleanup_results = [
(success_ostree_remote_cleanup, "cleaning temporary refs/remotes"),
(success_ostree_remote_update, "updating default remote"),
(success_remove_upgrade_flags, "removing local upgrade flags"),
(success_ostree_undeploy_from_release, "undeploying from-release ostree deployment"),
(success_remove_lvm_snapshots, "removing LVM snapshots"),
]
for result, log_msg in cleanup_results:
if result:
LOG.info("Success %s" % log_msg)
else:
LOG.error("Failure %s, manual cleanup is required" % log_msg)
success = all(x is True for x, _ in cleanup_results)
resp = SoftwareMessageDeployDeleteCleanupResp()
resp.success = success
resp.send(sock, addr)
def send(self, sock): # pylint: disable=unused-argument
LOG.error("Should not get here")
class SoftwareMessageDeployDeleteCleanupResp(messages.PatchMessage):
def __init__(self):
messages.PatchMessage.__init__(self, messages.PATCHMSG_DEPLOY_DELETE_CLEANUP_RESP)
self.success = None
def encode(self):
messages.PatchMessage.encode(self)
self.message["success"] = self.success
def handle(self, sock, addr):
LOG.error("Should not get here")
def send(self, sock, addr):
self.encode()
message = json.dumps(self.message)
sock.sendto(str.encode(message), (addr[0], cfg.controller_port))
class SoftwareMessageCheckAgentAliveReq(messages.PatchMessage):
def __init__(self):
messages.PatchMessage.__init__(self, messages.PATCHMSG_CHECK_AGENT_ALIVE_REQ)
def decode(self, data):
messages.PatchMessage.decode(self, data)
def handle(self, sock, addr):
LOG.info("Handling check agent alive from %s", addr[0])
check_alive_resp = SoftwareMessageCheckAgentAliveResp()
check_alive_resp.send(sock, addr)
def send(self, sock): # pylint: disable=unused-argument
LOG.error("Should not get here")
class SoftwareMessageCheckAgentAliveResp(messages.PatchMessage):
def __init__(self):
messages.PatchMessage.__init__(self, messages.PATCHMSG_CHECK_AGENT_ALIVE_RESP)
def encode(self):
messages.PatchMessage.encode(self)
def handle(self, sock, addr):
LOG.error("Should not get here")
def send(self, sock, addr):
LOG.info("Sending check agent alive resp to %s", addr[0])
self.encode()
message = json.dumps(self.message)
sock.sendto(str.encode(message), (addr[0], cfg.controller_port))
class PatchAgent(PatchService):
def __init__(self):
PatchService.__init__(self)
self.sock_out = None
self.sock_in = None
self.controller_address = None
self.listener = None
self.changes = False
self.latest_feed_commit = None
self.latest_sysroot_commit = None
self.patch_op_counter = 0
self.node_is_patched = os.path.exists(node_is_patched_file)
self.node_is_patched_timestamp = 0
self.query_id = 0
self.state = constants.PATCH_AGENT_STATE_IDLE
self.last_config_audit = 0
self.rejection_timestamp = 0
self.last_repo_revision = None
# Create persist folder
if not os.path.exists(SOFTWARE_PERSIST_FOLDER):
try:
os.makedirs(SOFTWARE_PERSIST_FOLDER, exist_ok=True)
except PermissionError:
LOG.error("Permission denied: Cannot create %s", SOFTWARE_PERSIST_FOLDER)
# Check state flags
if os.path.exists(patch_installing_file):
# We restarted while installing. Change to failed
setflag(patch_failed_file)
clearflag(patch_installing_file)
if os.path.exists(patch_failed_file):
self.state = constants.PATCH_AGENT_STATE_INSTALL_FAILED
self.patch_failed = os.path.exists(patch_failed_file)
def update_config(self):
cfg.read_config()
if self.port != cfg.agent_port:
self.port = cfg.agent_port
# Loopback interface does not support multicast messaging, therefore
# revert to using unicast messaging when configured against the
# loopback device
if self.pre_bootstrap:
self.mcast_addr = None
self.controller_address = utils.gethostbyname(constants.PREBOOTSTRAP_HOSTNAME)
elif cfg.get_mgmt_iface() == constants.LOOPBACK_INTERFACE_NAME:
self.mcast_addr = None
self.controller_address = cfg.get_mgmt_ip()
else:
self.mcast_addr = cfg.agent_mcast_group
self.controller_address = cfg.controller_mcast_group
def setup_tcp_socket(self):
hostname = None
if self.pre_bootstrap:
hostname = constants.PREBOOTSTRAP_HOSTNAME
address_family = utils.get_management_family(hostname)
if self.listener is not None:
self.listener.close()
self.listener = socket.socket(address_family, socket.SOCK_STREAM)
self.listener.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
self.listener.bind(('', self.port))
self.listener.listen(2) # Allow two connections, for two controllers
def set_install_failed_flags(self):
"""Set flags and states for a failed patch"""
self.patch_failed = True
setflag(patch_failed_file)
self.state = constants.PATCH_AGENT_STATE_INSTALL_FAILED
def query(self, major_release=None):
"""Check current patch state """
if not self.install_local and not check_install_uuid():
LOG.info("Failed install_uuid check. Skipping query")
return False
# Generate a unique query id
self.query_id = random.random()
# determine OSTREE state of the system and the patches
self.changes = False
active_sysroot_commit = ostree_utils.get_sysroot_latest_commit()
self.latest_sysroot_commit = active_sysroot_commit
self.last_repo_revision = active_sysroot_commit
# checks if this is a major release deployment operation
if major_release:
self.changes = True
self.latest_feed_commit = None
return True
# latest_feed_commit is sent from patch controller
# if unprovisioned (no mgmt ip) attempt to query it
if self.latest_feed_commit is None:
if self.sock_out is None:
try:
self.latest_feed_commit = utils.get_controller_feed_latest_commit(SW_VERSION)
except OSTreeCommandFail:
LOG.warning("Unable to query latest feed commit")
# latest_feed_commit will remain as None
if self.latest_feed_commit:
if active_sysroot_commit != self.latest_feed_commit:
LOG.info("Active Sysroot Commit:%s does not match "
"active controller's Feed Repo Commit: %s",
active_sysroot_commit, self.latest_feed_commit)
self.changes = True
latest_deployment_commit = ostree_utils.get_latest_deployment_commit()
if latest_deployment_commit:
if latest_deployment_commit != self.latest_feed_commit:
LOG.info("Latest deployment Commit:%s does not match "
"active controller's Feed Repo Commit: %s",
latest_deployment_commit, self.latest_feed_commit)
self.changes = True
else:
self.changes = False
return True
def handle_install(self,
verbose_to_stdout=False,
disallow_insvc_patch=False,
major_release=None,
commit_id=None,
additional_data=None):
#
# The disallow_insvc_patch parameter is set when we're installing
# the patch during init. At that time, we don't want to deal with
# in-service patch scripts, so instead we'll treat any patch as
# a reboot-required when this parameter is set. Rather than running
# any scripts, the RR flag will be set, which will result in the node
# being rebooted immediately upon completion of the installation.
#
LOG.info("Handling install")
# Check the INSTALL_UUID first. If it doesn't match the active
# controller, we don't want to install patches.
if not self.install_local and not check_install_uuid():
LOG.error("Failed install_uuid check. Skipping install")
self.set_install_failed_flags()
# Send a hello to provide a state update
if self.sock_out is not None:
hello_ack = PatchMessageHelloAgentAck()
hello_ack.send(self.sock_out)
return False
# if commit-id is provided, check if it is already deployed
if commit_id:
active_commit_id = ostree_utils.get_latest_deployment_commit()
if commit_id == active_commit_id:
LOG.info("The provided commit-id is already deployed. Skipping install.")
success = True
# when in major release deployment, if hooks failed in a previous deploy
# host attempt, a flag is created so that their execution is reattempted here
if major_release and os.path.exists(run_hooks_flag):
LOG.info("Major release deployment %s flag found. "
"Running hooks." % run_hooks_flag)
try:
execute_agent_hooks(major_release, additional_data=additional_data)
clearflag(run_hooks_flag)
except Exception:
success = False
if success:
self.patch_failed = False
clearflag(patch_failed_file)
self.state = constants.PATCH_AGENT_STATE_IDLE
else:
self.set_install_failed_flags()
return success
# prepare major release deployment
remote = None
ref = None
if major_release:
LOG.info("Major release deployment for %s with commit %s" % (major_release, commit_id))
# add remote
nodetype = utils.get_platform_conf("nodetype")
remote = ostree_utils.add_ostree_remote(major_release, nodetype)
if not remote:
LOG.exception("Unable to continue major release deployment as "
"there was an error adding the remote.")
return False
LOG.info("OSTree remote added: %s" % remote)
# check if remote commit_id matches with the one sent by the controller
commit_id_match = ostree_utils.check_commit_id(remote, commit_id)
if not commit_id_match:
LOG.exception("The OSTree commit_id %s sent by the controller "
"doesn't match with the remote commit_id." % commit_id)
ostree_utils.delete_ostree_remote(remote)
ostree_utils.delete_ostree_ref(constants.OSTREE_REF)
LOG.info("OSTree remote deleted: %s" % remote)
return False
ref = "%s:%s" % (remote, constants.OSTREE_REF)
self.state = constants.PATCH_AGENT_STATE_INSTALLING
setflag(patch_installing_file)
try:
# Create insvc patch directories
if not os.path.exists(insvc_software_scripts):
os.makedirs(insvc_software_scripts, 0o700)
if not os.path.exists(insvc_software_flags):
os.makedirs(insvc_software_flags, 0o700)
except Exception:
LOG.exception("Failed to create in-service patch directories")
# Send a hello to provide a state update
if self.sock_out is not None:
hello_ack = PatchMessageHelloAgentAck()
hello_ack.send(self.sock_out)
# Build up the install set
if verbose_to_stdout:
print("Checking for software updates...")
self.query(major_release=major_release) # sets self.changes
changed = False
success = True
if self.changes or \
os.path.exists(ostree_pull_completed_deployment_pending_file) or \
os.path.exists(mount_pending_file):
try:
LOG.info("Running pre-install patch-scripts")
pull_install_scripts_from_controller(install_local=self.install_local)
subprocess.check_output([run_install_software_scripts_cmd, "preinstall"],
stderr=subprocess.STDOUT)
# Pull changes from remote to the sysroot ostree
# The remote value is configured inside
# "/sysroot/ostree/repo/config" file
ostree_utils.pull_ostree_from_remote(remote=remote)
self.query(major_release=major_release) # Updates following self variables
if self.latest_feed_commit:
# If latest_feed_commit is not null, the node can check the deployment health
if self.latest_sysroot_commit == self.latest_feed_commit:
LOG.info("Pull from remote was successful")
setflag(ostree_pull_completed_deployment_pending_file)
# Retry deployment creation until successful or maximum retries reached
retries = 0
while not changed and retries < constants.MAX_OSTREE_DEPLOY_RETRIES:
latest_deployment = None
try:
# Create a new deployment once the changes are pulled
ostree_utils.create_deployment(ref=ref)
latest_deployment = ostree_utils.get_latest_deployment_commit()
except OSTreeCommandFail:
LOG.warning("Failed to create deployment during host-install.")
if latest_deployment and \
latest_deployment == self.latest_feed_commit:
changed = True
clearflag(ostree_pull_completed_deployment_pending_file)
else:
retries += 1
LOG.warning("Deloyment not created in %d attempt(s)", retries)
time.sleep(10)
else:
LOG.error("Sysroot commit does not match the feed commit. "
"Skipping deployment retries")
else:
# If not able to retrieve latest_feed_commit, proceed without retries
LOG.info("Creating ostree deployment without live retry")
setflag(ostree_pull_completed_deployment_pending_file)
ostree_utils.create_deployment(ref=ref)
changed = True
clearflag(ostree_pull_completed_deployment_pending_file)
except OSTreeCommandFail:
LOG.exception("Failed to pull changes and create deployment"
"during host-install.")
except subprocess.CalledProcessError as e:
LOG.exception("Failed to execute pre-install scripts.")
LOG.error("Command output: %s", e.output)
success = changed # If a change was made, success is true otherwise false
if changed:
# Update the node_is_patched flag
setflag(node_is_patched_file)
self.node_is_patched = True
if verbose_to_stdout:
print("This node has been patched.")
if os.path.exists(node_is_software_updated_rr_file):
LOG.info("Reboot is required. Skipping patch-scripts")
elif disallow_insvc_patch:
LOG.info("Disallowing patch-scripts. Treating as reboot-required")
setflag(node_is_software_updated_rr_file)
else:
LOG.info("Mounting the new deployment")
try:
pending_deployment = ostree_utils.fetch_pending_deployment()
deployment_dir = constants.OSTREE_BASE_DEPLOYMENT_DIR + pending_deployment
setflag(mount_pending_file)
ostree_utils.mount_new_deployment(deployment_dir)
clearflag(mount_pending_file)
LOG.info("Running post-install patch-scripts")
subprocess.check_output([run_install_software_scripts_cmd, "postinstall"],
stderr=subprocess.STDOUT)
# Clear the node_is_patched flag, since we've handled it in-service
clearflag(node_is_patched_file)
self.node_is_patched = False
except subprocess.CalledProcessError as e:
LOG.exception("Failed to execute post-install scripts.")
LOG.error("Command output: %s", e.output)
success = False
# Clear the in-service patch dirs
if os.path.exists(insvc_software_scripts):
shutil.rmtree(insvc_software_scripts, ignore_errors=True)
if os.path.exists(insvc_software_flags):
shutil.rmtree(insvc_software_flags, ignore_errors=True)
if success:
self.patch_failed = False
clearflag(patch_failed_file)
self.state = constants.PATCH_AGENT_STATE_IDLE
# run deploy host hooks for major release
if major_release:
try:
execute_agent_hooks(major_release, additional_data=additional_data)
except Exception:
setflag(run_hooks_flag)
self.set_install_failed_flags()
success = False
else:
self.set_install_failed_flags()
clearflag(patch_installing_file)
self.query() # Update self.changes
if self.changes:
LOG.warning("Installing the patch did not change the patch current status")
if os.path.exists(node_is_software_updated_rr_file):
LOG.error("No deployment created and reboot required flag exists")
self.set_install_failed_flags()
# Clear flag to avoid reboot loop
clearflag(node_is_software_updated_rr_file)
# Send a hello to provide a state update
if self.sock_out is not None:
hello_ack = PatchMessageHelloAgentAck()
hello_ack.send(self.sock_out)
# Indicate if the method was successful
# success means no change needed, or a change worked.
return success
def handle_patch_op_counter(self, counter):
changed = False
if os.path.exists(node_is_patched_file):
# The node has been patched. Run a query if:
# - node_is_patched didn't exist previously
# - node_is_patched timestamp changed
timestamp = os.path.getmtime(node_is_patched_file)
if not self.node_is_patched:
self.node_is_patched = True
self.node_is_patched_timestamp = timestamp
changed = True
elif self.node_is_patched_timestamp != timestamp:
self.node_is_patched_timestamp = timestamp
changed = True
elif self.node_is_patched:
self.node_is_patched = False
self.node_is_patched_timestamp = 0
changed = True
if self.patch_op_counter < counter:
self.patch_op_counter = counter
changed = True
if changed:
rc = self.query()
if not rc:
# Query failed. Reset the op counter
self.patch_op_counter = 0
def handle_bootstrap(self, connections):
# If bootstrap is completed re-initialize sockets
self.pre_bootstrap = False
self.install_local = False
self.setup_socket()
while self.sock_out is None:
time.sleep(30)
self.setup_socket()
self.setup_tcp_socket()
hello_ack = PatchMessageHelloAgentAck()
hello_ack.send(self.sock_out)
for s in connections:
connections.remove(s)
s.close()
def run(self):
# Check if bootstrap stage is completed
if self.pre_bootstrap and cfg.get_mgmt_ip():
self.pre_bootstrap = False
if self.pre_bootstrap or os.path.isfile(constants.INSTALL_LOCAL_FLAG):
self.install_local = True
else:
self.install_local = False
self.setup_socket()
while self.sock_out is None:
# Check every thirty seconds?
# Once we've got a conf file, tied into packstack,
# we'll get restarted when the file is updated,
# and this should be unnecessary.
time.sleep(30)
self.setup_socket()
self.setup_tcp_socket()
# Ok, now we've got our socket.
# Let's let the controllers know we're here
hello_ack = PatchMessageHelloAgentAck()
hello_ack.send(self.sock_out)
first_hello = True
connections = []
timeout = time.time() + 30.0
remaining = 30
while True:
if self.pre_bootstrap and cfg.get_mgmt_ip():
self.handle_bootstrap(connections)
first_hello = True
if os.path.isfile(constants.INSTALL_LOCAL_FLAG) or self.pre_bootstrap:
self.install_local = True
else:
self.install_local = False
inputs = [self.sock_in, self.listener] + connections
outputs = []
rlist, wlist, xlist = select.select(inputs, outputs, inputs, remaining)
remaining = int(timeout - time.time())
if remaining <= 0 or remaining > 30:
timeout = time.time() + 30.0
remaining = 30
if (len(rlist) == 0 and
len(wlist) == 0 and
len(xlist) == 0):
# Timeout hit
self.audit_socket()
continue
for s in rlist:
if s == self.listener:
conn, addr = s.accept()
connections.append(conn)
continue
data = ''
addr = None
msg = None
if s == self.sock_in:
# Receive from UDP
data, addr = s.recvfrom(1024)
else:
# Receive from TCP
while True:
try:
packet = s.recv(1024)
except socket.error:
LOG.exception("Socket error on recv")
data = ''
break
if packet:
data += packet.decode()
if data == '':
break
try:
json.loads(data)
break
except ValueError:
# Message is incomplete
continue
else:
# End of TCP message received
break
if data == '':
# Connection dropped
connections.remove(s)
s.close()
continue
msgdata = json.loads(data)
# For now, discard any messages that are not msgversion==1
if 'msgversion' in msgdata and msgdata['msgversion'] != 1:
continue
if 'msgtype' in msgdata:
if msgdata['msgtype'] == messages.PATCHMSG_HELLO_AGENT:
if first_hello:
self.query()
first_hello = False
msg = PatchMessageHelloAgent()
elif msgdata['msgtype'] == messages.PATCHMSG_QUERY_DETAILED:
msg = PatchMessageQueryDetailed()
elif msgdata['msgtype'] == messages.PATCHMSG_SEND_LATEST_FEED_COMMIT:
msg = PatchMessageSendLatestFeedCommit()
elif msgdata['msgtype'] == messages.PATCHMSG_AGENT_INSTALL_REQ:
msg = PatchMessageAgentInstallReq()
elif msgdata['msgtype'] == messages.PATCHMSG_DEPLOY_DELETE_CLEANUP_REQ:
msg = SoftwareMessageDeployDeleteCleanupReq()
elif msgdata['msgtype'] == messages.PATCHMSG_CHECK_AGENT_ALIVE_REQ:
msg = SoftwareMessageCheckAgentAliveReq()
if msg is None:
msg = messages.PatchMessage()
msg.decode(msgdata)
if s == self.sock_in:
msg.handle(self.sock_out, addr)
else:
msg.handle(s, addr)
for s in xlist:
if s in connections:
connections.remove(s)
s.close()
# Check for in-service patch restart flag
if os.path.exists(insvc_software_restart_agent):
# Make sure it's safe to restart, ie. no reqs queued
rlist, wlist, xlist = select.select(inputs, outputs, inputs, 0)
if (len(rlist) == 0 and
len(wlist) == 0 and
len(xlist) == 0):
# Restart
LOG.info("In-service software restart flag detected. Exiting.")
os.remove(insvc_software_restart_agent)
exit(0)
def main():
global pa
configure_logging()
cfg.read_config()
pa = PatchAgent()
if os.path.isfile(constants.INSTALL_LOCAL_FLAG):
pa.install_local = True
else:
pa.install_local = False
pa.query()
# Run on reboot after the node was updated by a reboot required patch/ISO
if os.path.exists(node_is_software_updated_rr_file):
ostree_utils.delete_older_deployments()
run_post_install_script()
clearflag(node_is_software_updated_rr_file)
if len(sys.argv) <= 1:
pa.run()
elif sys.argv[1] == "--install":
if not pa.install_local and not check_install_uuid():
# In certain cases, the lighttpd server could still be running using
# its default port 80, as opposed to the port configured in platform.conf
global http_port_real
LOG.info("Failed install_uuid check via http_port=%s. Trying with default port 80", http_port_real)
http_port_real = 80
pa.handle_install(verbose_to_stdout=True,
disallow_insvc_patch=True)
elif sys.argv[1] == "--status":
rc = 0
if pa.changes:
rc = 1
exit(rc)