Follow up changes for swact check to work

This commit is a follow-up to the proxy API change [1]. It adds semantic
checks for USM upgrades that restrict host-swact and host-unlock.

This commit also removes the check that prevents upgrade and downgrade
when new hardware is detected during the upgrade procedure.

Test Plan:

PASS: Ran the upgrade on DX with USM and observed that the restrictions take effect.

Task: 49798
Story: 2010676

[1] https://review.opendev.org/c/starlingx/config/+/914974

Change-Id: I8ded9faf7691ce849d51ef39f7598f287c6f1ca4
Signed-off-by: Bin Qian <bin.qian@windriver.com>
This commit is contained in:
Bin Qian 2024-04-02 14:33:38 +00:00
parent 46321c2402
commit 0427121532
4 changed files with 115 additions and 203 deletions

View File

@ -1,5 +1,4 @@
# vim: tabstop=4 shiftwidth=4 softtabstop=4
#
# Copyright 2013 Hewlett-Packard Development Company, L.P.
# All Rights Reserved.
@ -2880,12 +2879,6 @@ class HostController(rest.RestController):
"all osds must be down.")
% (rpc_ihost.hostname))
if upgrade.state in [constants.UPGRADE_STARTED]:
LOG.info("host-upgrade check upgrade_refresh %s" %
rpc_ihost.hostname)
force = body.get('force', False) is True
self._semantic_check_upgrade_refresh(upgrade, rpc_ihost, force)
# Update the target load for this host
self._update_load(uuid, body, new_target_load)
@ -2983,10 +2976,6 @@ class HostController(rest.RestController):
"host-downgrade rejected: Upgrade not in %s state." %
constants.UPGRADE_ABORTING))
# Check for new hardware since upgrade-start
force = body.get('force', False) is True
self._semantic_check_downgrade_refresh(upgrade, rpc_ihost, force)
# Remove the host manifest. This is similar to the process taken
# during host-reinstall. The manifest needs to be removed to prevent
# the host from running kubeadm prematurely.
@ -3778,12 +3767,21 @@ class HostController(rest.RestController):
# Check if there's an upgrade in progress
upgrade = usm_service.get_platform_upgrade(pecan.request.dbapi)
if upgrade.state == constants.UPGRADE_UPGRADING_CONTROLLERS:
# TODO (bqian) this block will be removed once legacy upgrade is no longer supported
host_upgrade = objects.host_upgrade.get_by_host_id(
pecan.request.context, ihost['id'])
if host_upgrade.software_load == upgrade.from_load:
raise wsme.exc.ClientSideError(
_("Upgrade is in progress. At this time %s cannot be unlocked"
% ihost['hostname']))
elif upgrade.state in [constants.DEPLOY_STATE_HOST, constants.DEPLOY_STATE_HOST_FAILED]:
# USM upgrade, host-unlock is allowed only when it is deployed
host_upgrade = usm_service.get_host_deploy(pecan.request.dbapi, ihost['hostname'])
if host_upgrade is not None:
if host_upgrade['host_state'] != constants.DEPLOY_HOST_DEPLOYED:
raise wsme.exc.ClientSideError(
_("Upgrade is in progress. At this time %s cannot be unlocked"
% ihost['hostname']))
except exception.NotFound:
pass
@ -3807,9 +3805,6 @@ class HostController(rest.RestController):
ihost['hostname'])
raise wsme.exc.ClientSideError(msg)
# Check for new hardware since upgrade-start
self._semantic_check_upgrade_refresh(upgrade, ihost, force_unlock)
@staticmethod
def _semantic_check_duplex_oam_config(ihost):
system = pecan.request.dbapi.isystem_get_one()
@ -4584,131 +4579,6 @@ class HostController(rest.RestController):
'ok': constants.SB_STATE_CONFIGURED}
raise wsme.exc.ClientSideError(msg)
@staticmethod
def _new_host_hardware_since_upgrade(host, upgrade_created_at):
    """
    Collect the inventory added to a host after the upgrade started.

    Disks, interfaces and stors of the host are inspected in that order;
    an item counts as new when it has a ``created_at`` value that is later
    than ``upgrade_created_at``.

    :param host: host object (its ``id`` and ``hostname`` are read)
    :param upgrade_created_at: upgrade start timestamp
    :returns: list of ``(kind, hostname, [uuid, ...])`` tuples, one per
              inventory kind that has new items; empty if nothing is new
    """
    # (label reported to the caller, dbapi method listing that inventory)
    inventory_kinds = (
        ('disks', 'idisk_get_by_ihost'),
        ('interfaces', 'iinterface_get_by_ihost'),
        ('stors', 'istor_get_by_ihost'),
    )

    found = []
    for label, getter_name in inventory_kinds:
        items = getattr(pecan.request.dbapi, getter_name)(host.id)
        fresh_uuids = []
        for item in items:
            # Items without a creation timestamp are never treated as new.
            if item.created_at and (item.created_at > upgrade_created_at):
                fresh_uuids.append(item.uuid)
        if fresh_uuids:
            found.append((label, host.hostname, fresh_uuids))

    return found
def _semantic_check_upgrade_refresh(self, upgrade, ihost, force):
    """
    Reject the request when hardware was added after the upgrade started.

    The check is only performed for controller-1 and only while the
    upgrade is in the started, data-migration, data-migration-complete or
    upgrading-controllers state; otherwise it returns without raising.

    :param upgrade: upgrade object (``state`` and ``created_at`` are read)
    :param ihost: host being acted on, indexed by ``'hostname'``
    :param force: when true, the check is skipped entirely
    :raises wsme.exc.ClientSideError: if a host, or a disk, interface or
        stor on an existing host, was created after ``upgrade.created_at``
    """
    if force:
        LOG.info("_semantic_check_upgrade_refresh check force")
        return

    if ihost['hostname'] != constants.CONTROLLER_1_HOSTNAME:
        return

    if upgrade.state not in [constants.UPGRADE_STARTED,
                             constants.UPGRADE_DATA_MIGRATION,
                             constants.UPGRADE_DATA_MIGRATION_COMPLETE,
                             constants.UPGRADE_UPGRADING_CONTROLLERS]:
        LOG.info("_semantic_check_upgrade_refresh allow upgrade state=%s",
                 upgrade.state)
        return

    upgrade_created_at = upgrade.created_at

    # check for new host hardware since upgrade started
    new_hw = []
    for h in pecan.request.dbapi.ihost_get_list():
        if not h.personality:
            continue

        if h.created_at > upgrade_created_at:
            # A host created after upgrade-start is sufficient to reject
            # the request, so scanning stops here.
            new_hw.append(('host', h.hostname, h.uuid))
            break

        new_hw.extend(
            self._new_host_hardware_since_upgrade(h, upgrade_created_at))

    if new_hw:
        # Interpolate after the translation lookup so the lookup sees the
        # constant message template rather than the formatted text.
        msg = _("New hardware %s detected after upgrade started at %s. "
                "Upgrade should be aborted.") % (new_hw, upgrade_created_at)
        raise wsme.exc.ClientSideError(msg)
def _semantic_check_downgrade_refresh(self, upgrade, ihost, force):
    """
    Reject a downgrade of a host that gained hardware after upgrade start.

    The check is only performed while the upgrade is in the aborting or
    aborting-rollback state. New hardware found on *other* hosts is logged
    but does not block the downgrade of ``ihost``.

    :param upgrade: upgrade object (``state`` and ``created_at`` are read)
    :param ihost: host being downgraded, indexed by ``'hostname'``
    :param force: when true, the check is skipped entirely
    :raises wsme.exc.ClientSideError: if ``ihost`` itself, or a disk,
        interface or stor on it, was created after ``upgrade.created_at``
    """
    if force:
        LOG.info("_semantic_check_downgrade_refresh check force")
        return

    if upgrade.state not in [constants.UPGRADE_ABORTING,
                             constants.UPGRADE_ABORTING_ROLLBACK]:
        LOG.info("_semantic_check_downgrade_refresh allow upgrade state=%s",
                 upgrade.state)
        return

    upgrade_created_at = upgrade.created_at

    # check for new host hardware since upgrade started
    new_hw = []
    for h in pecan.request.dbapi.ihost_get_list():
        if not h.personality:
            continue

        # Unlike the upgrade check there is no early exit: every host is
        # scanned so the entries for ``ihost`` are found wherever they are.
        if h.created_at > upgrade_created_at:
            new_hw.append(('host', h.hostname, h.uuid))

        new_hw.extend(
            self._new_host_hardware_since_upgrade(h, upgrade_created_at))

    if not new_hw:
        return

    # Each entry is (kind, hostname, info); only the hostname matters here.
    if any(entry[1] == ihost['hostname'] for entry in new_hw):
        # Interpolate after the translation lookup so the lookup sees the
        # constant message template rather than the formatted text.
        msg = _("New host %s detected after upgrade started at %s. "
                "Host can not be downgraded.") % (ihost['hostname'],
                                                  upgrade_created_at)
        raise wsme.exc.ClientSideError(msg)

    # Acceptable to downgrade this host
    msg = _("New host hardware %s detected after upgrade "
            "started at %s. "
            "Allow downgrade of %s during upgrade abort phase.") % (
                new_hw, upgrade_created_at, ihost['hostname'])
    LOG.info(msg)
@staticmethod
def _semantic_check_nova_local_storage(ihost_uuid, personality, required=False):
"""
@ -6305,63 +6175,74 @@ class HostController(rest.RestController):
# No upgrade in progress so nothing to check
return
# Get the load running on the destination controller
# TODO(bqian) below should call USM for host upgrade for USM major release
# deploy
host_upgrade = objects.host_upgrade.get_by_host_id(
pecan.request.context, to_host['id'])
to_host_load_id = host_upgrade.software_load
if isinstance(upgrade, usm_service.UsmUpgrade):
to_host_deploy = usm_service.get_host_deploy(pecan.request.dbapi, to_host['hostname'])
if to_host_deploy['host_state'] == constants.DEPLOY_HOST_DEPLOYED:
# to host has deployed
pass
else:
from_host_deploy = usm_service.get_host_deploy(pecan.request.dbapi, from_host['hostname'])
if from_host_deploy['host_state'] == constants.DEPLOY_HOST_PENDING and \
to_host_deploy['host_state'] == constants.DEPLOY_HOST_PENDING:
# no host has started deploy yet
pass
else:
err_msg = "Swact is not allowed. " + \
"New release has not been deployed to %s" % to_host['hostname']
raise wsme.exc.ClientSideError(err_msg)
else:
# TODO (bqian) below to be removed after USM upgrade cutoff
host_upgrade = objects.host_upgrade.get_by_host_id(
pecan.request.context, to_host['id'])
to_host_load_id = host_upgrade.software_load
# Get the load names
from_sw_version = objects.load.get_by_uuid(
pecan.request.context, upgrade.from_load).software_version
to_sw_version = objects.load.get_by_uuid(
pecan.request.context, upgrade.to_load).software_version
to_host_sw_version = objects.load.get_by_uuid(
pecan.request.context, to_host_load_id).software_version
# Get the load names
from_sw_version = objects.load.get_by_uuid(
pecan.request.context, upgrade.from_load).software_version
to_sw_version = objects.load.get_by_uuid(
pecan.request.context, upgrade.to_load).software_version
to_host_sw_version = objects.load.get_by_uuid(
pecan.request.context, to_host_load_id).software_version
if upgrade.state in [constants.UPGRADE_STARTING,
constants.UPGRADE_STARTED,
constants.UPGRADE_DATA_MIGRATION]:
# Swacting controllers is not supported until database migration is complete
raise wsme.exc.ClientSideError(
_("Swact action not allowed. Upgrade state must be %s") %
(constants.UPGRADE_DATA_MIGRATION_COMPLETE))
activating_states = [constants.UPGRADE_ACTIVATION_REQUESTED,
constants.UPGRADE_ACTIVATING]
if upgrade.state in activating_states and not force_swact:
# Block swacts during activation to prevent interrupting the
# upgrade scripts.
# Allow swacts during UPGRADE_ACTIVATING_HOSTS as the active
# controller may need a lock/unlock if a runtime manifest fails.
# Allow force swacts for recovery in edge cases.
raise wsme.exc.ClientSideError(
_("Swact action not allowed. Wait until the upgrade-activate "
"command completes"))
if upgrade.state == constants.UPGRADE_ABORTING:
if to_host_load_id == upgrade.to_load:
# Cannot swact to new load if aborting upgrade
if upgrade.state in [constants.UPGRADE_STARTING,
constants.UPGRADE_STARTED,
constants.UPGRADE_DATA_MIGRATION]:
# Swacting controllers is not supported until database migration is complete
raise wsme.exc.ClientSideError(
_("Aborting upgrade: %s must be using load %s before this "
"operation can proceed. Currently using load %s.") %
(to_host['hostname'], from_sw_version, to_host_sw_version))
elif upgrade.state == constants.UPGRADE_ABORTING_ROLLBACK:
if from_host['software_load'] == from_sw_version and to_host.software_load == to_sw_version:
raise wsme.exc.ClientSideError(_("Aborting upgrade: Unable to swact from %s to %s")
% (from_sw_version, to_sw_version))
elif to_host_load_id == upgrade.from_load:
# On CPE loads we must abort before we swact back to the old load
# Any VMs on the active controller will be lost during the swact
if constants.WORKER in to_host.subfunctions:
raise wsme.exc.ClientSideError(
_("Upgrading: %s must be using load %s before this "
"operation can proceed. Currently using load %s.") %
(to_host['hostname'], to_sw_version, to_host_sw_version))
_("Swact action not allowed. Upgrade state must be %s") %
(constants.UPGRADE_DATA_MIGRATION_COMPLETE))
# Check for new hardware since upgrade-start
self._semantic_check_upgrade_refresh(upgrade, to_host, force_swact)
activating_states = [constants.UPGRADE_ACTIVATION_REQUESTED,
constants.UPGRADE_ACTIVATING]
if upgrade.state in activating_states and not force_swact:
# Block swacts during activation to prevent interrupting the
# upgrade scripts.
# Allow swacts during UPGRADE_ACTIVATING_HOSTS as the active
# controller may need a lock/unlock if a runtime manifest fails.
# Allow force swacts for recovery in edge cases.
raise wsme.exc.ClientSideError(
_("Swact action not allowed. Wait until the upgrade-activate "
"command completes"))
if upgrade.state == constants.UPGRADE_ABORTING:
if to_host_load_id == upgrade.to_load:
# Cannot swact to new load if aborting upgrade
raise wsme.exc.ClientSideError(
_("Aborting upgrade: %s must be using load %s before this "
"operation can proceed. Currently using load %s.") %
(to_host['hostname'], from_sw_version, to_host_sw_version))
elif upgrade.state == constants.UPGRADE_ABORTING_ROLLBACK:
if from_host['software_load'] == from_sw_version and to_host.software_load == to_sw_version:
raise wsme.exc.ClientSideError(_("Aborting upgrade: Unable to swact from %s to %s")
% (from_sw_version, to_sw_version))
elif to_host_load_id == upgrade.from_load:
# On AIO loads we must abort before we swact back to the old load
# Any VMs on the active controller will be lost during the swact
if constants.WORKER in to_host.subfunctions:
raise wsme.exc.ClientSideError(
_("Upgrading: %s must be using load %s before this "
"operation can proceed. Currently using load %s.") %
(to_host['hostname'], to_sw_version, to_host_sw_version))
def _semantic_check_swact_kube_rootca_update(self, ihost, force_swact=False):
"""

View File

@ -2537,3 +2537,10 @@ FILEPATH = "File Path"
AUTOMATIC = "Automatic"
MANUAL = "Manual"
ISSUER = "Issuer"
# USM deploy states (compared against the state of the upgrade object)
DEPLOY_STATE_HOST = 'host' # hosts are being deployed
DEPLOY_STATE_HOST_FAILED = 'host-failed' # host deployment failed
# USM per-host deploy states (compared against a deploy host's 'host_state')
DEPLOY_HOST_PENDING = 'pending' # host is pending a new deployment
DEPLOY_HOST_DEPLOYED = 'deployed' # new software is deployed to the host

View File

@ -36,12 +36,19 @@ class UsmUpgrade(object):
return not (self == other)
def get_software_upgrade(token, region_name, timeout=30):
def get_region_name(dbapi):
    """Return the ``region_name`` of the system record fetched via *dbapi*.

    :param dbapi: database API object providing ``isystem_get_one()``
    :returns: the ``region_name`` attribute of that system record
    """
    return dbapi.isystem_get_one().region_name
if not token:
token = get_token(region_name)
endpoint = token.get_service_url("usm", "usm")
def get_usm_endpoint(token):
    """Look up the USM service URL from *token*.

    :param token: token object providing ``get_service_url(type, name)``
    :returns: whatever ``token.get_service_url("usm", "usm")`` returns
    """
    usm_url = token.get_service_url("usm", "usm")
    return usm_url
def get_software_upgrade(region_name, timeout=30):
token = get_token(region_name)
endpoint = get_usm_endpoint(token)
if not endpoint:
return None
@ -52,6 +59,27 @@ def get_software_upgrade(token, region_name, timeout=30):
return response
def get_host_deploy(dbapi, hostname):
    """Fetch deploy-host information from the USM ``/v1/deploy_host`` API.

    :param dbapi: database API object, used to look up the region name
    :param hostname: host to look for, or None to get the full response
    :returns: None when no USM endpoint is available; the full response
              when *hostname* is None; otherwise the first entry whose
              ``'hostname'`` equals *hostname*, or None if none matches

    NOTE(review): assumes the request returns an iterable of dicts that
    each carry a 'hostname' key -- confirm against rest_api_request/USM.
    """
    token = get_token(get_region_name(dbapi))
    usm_url = get_usm_endpoint(token)
    if not usm_url:
        return None

    deploy_hosts = rest_api_request(
        token, "GET", usm_url + "/v1/deploy_host", timeout=10)
    if hostname is None:
        return deploy_hosts

    return next(
        (entry for entry in deploy_hosts if entry['hostname'] == hostname),
        None)
def get_platform_upgrade(dbapi, usm_only=False):
"""
Get upgrade object from either sysinv db or USM service.
@ -60,11 +88,10 @@ def get_platform_upgrade(dbapi, usm_only=False):
"""
upgrade = None
system = dbapi.isystem_get_one()
region_name = system.region_name
region_name = get_region_name(dbapi)
try:
response = get_software_upgrade(None, region_name)
response = get_software_upgrade(region_name)
if response:
upgrade = UsmUpgrade(state=response["state"],
from_load=response["from_release"],

View File

@ -4842,11 +4842,8 @@ class ConductorManager(service.PeriodicService):
# If they somehow don't match we've got bigger problems
return True
# TODO(bqian) this is to be replaced with host.sw_version after
# https://review.opendev.org/c/starlingx/config/+/915376
# in a USM upgrade scenario.
host_obj = self.dbapi.ihost_get(host_uuid)
host_version = host_obj.software_load
host_version = host_obj.sw_version
return host_version == tsc.SW_VERSION