Merge "Improved sw-deploy-strategy reentrancy"

This commit is contained in:
Zuul 2024-07-18 18:14:52 +00:00 committed by Gerrit Code Review
commit ff5449fd0a
5 changed files with 210 additions and 130 deletions

View File

@ -1046,13 +1046,17 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
Test the sw_deploy strategy when patch already deployed:
- patch already deployed
Verify:
- Fail
- Success
"""
_, strategy = self._gen_aiosx_hosts_and_strategy(
nfvi_upgrade=nfvi.objects.v1.Upgrade(
'13.01',
{'state': 'deployed'},
{
'state': 'deployed',
'reboot_required': False,
'sw_version': PATCH_RELEASE_UPGRADE,
},
None,
None,
)
@ -1062,12 +1066,10 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
strategy.sw_update_obj = fake_upgrade_obj
strategy.build_complete(common_strategy.STRATEGY_RESULT.SUCCESS, "")
expected_reason = "Software release is already deployed or committed."
bpr = strategy.build_phase
assert strategy._state == common_strategy.STRATEGY_STATE.BUILD_FAILED, strategy._state
assert bpr.result == common_strategy.STRATEGY_PHASE_RESULT.FAILED, bpr.result
assert bpr.result_reason == expected_reason, bpr.result_reason
assert strategy._state == common_strategy.STRATEGY_STATE.INITIAL, strategy._state
assert bpr.result == common_strategy.STRATEGY_PHASE_RESULT.INITIAL, bpr.result
def test_sw_deploy_strategy_aiosx_already_deploy_completed(self):
"""
@ -1118,7 +1120,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
strategy.sw_update_obj = fake_upgrade_obj
strategy.build_complete(common_strategy.STRATEGY_RESULT.SUCCESS, "")
expected_reason = "Software release is already deployed or committed."
expected_reason = "Software release is committed."
bpr = strategy.build_phase
assert strategy._state == common_strategy.STRATEGY_STATE.BUILD_FAILED

View File

@ -1816,7 +1816,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
'total_stages': 0,
'result': 'failed',
'result_reason':
'all controller hosts must be unlocked-enabled-available'
"All hosts must be unlocked-enabled-available to start a new sw-deployment: ['controller-0']"
}
sw_update_testcase.validate_phase(build_phase, expected_results)
@ -1860,7 +1860,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
'total_stages': 0,
'result': 'failed',
'result_reason':
'all worker hosts must be unlocked-enabled-available'
"All hosts must be unlocked-enabled-available to start a new sw-deployment: ['compute-3']"
}
sw_update_testcase.validate_phase(build_phase, expected_results)

View File

@ -59,11 +59,27 @@ class Upgrade(ObjectData):
return self.release_info["sw_version"]
@property
def major_release(self):
if not self.release_info:
def from_release(self):
if not self.deploy_info:
return None
return is_major_release(SW_VERSION, self.sw_version)
return self.deploy_info["from_release"]
@property
def to_release(self):
    """
    Return the "to_release" entry of the deploy info, or None when no
    deploy info is set
    """
    deploy = self.deploy_info
    return deploy["to_release"] if deploy else None
@property
def major_release(self):
    """
    Return the result of is_major_release() for this upgrade

    When deploy info is present the comparison is made between
    from_release and to_release; otherwise, when release info is present,
    it is made between SW_VERSION and sw_version.  Returns None when
    neither is set.
    """
    if self.deploy_info:
        # A deployment record exists, compare its own endpoints
        return is_major_release(self.from_release, self.to_release)

    if not self.release_info:
        return None

    # On DX systems, SW_VERSION will not be accurate if only one host has been deployed.
    # Therefore, it should only be used when a deployment is not in progress.
    return is_major_release(SW_VERSION, self.sw_version)
@property
def is_available(self):
@ -73,6 +89,10 @@ class Upgrade(ObjectData):
def is_unavailable(self):
return self.release_state == usm_states.UNAVAILABLE
@property
def is_deploying(self):
    """
    Return True when the release state equals usm_states.DEPLOYING
    """
    deploying_state = usm_states.DEPLOYING
    return self.release_state == deploying_state
@property
def is_deployed(self):
return self.release_state == usm_states.DEPLOYED
@ -98,11 +118,11 @@ class Upgrade(ObjectData):
return self.deploy_state == usm_states.DEPLOY_STATES.HOST.value
@property
def is_deploy_hosts_done(self):
def is_deploying_hosts_done(self):
return self.deploy_state == usm_states.DEPLOY_STATES.HOST_DONE.value
@property
def is_deploy_hosts_failed(self):
def is_deploying_hosts_failed(self):
return self.deploy_state == usm_states.DEPLOY_STATES.HOST_FAILED.value
@property
@ -121,12 +141,10 @@ class Upgrade(ObjectData):
def is_deploy_completed(self):
return self.deploy_state == usm_states.DEPLOY_STATES.COMPLETED.value
@property
def all_hosts_deployed(self):
def is_host_deployed(self, hostname):
if not self.hosts_info:
return None
for v in self.hosts_info:
if v["host_state"] != usm_states.DEPLOY_HOST_STATES.DEPLOYED.value:
return False
return True
if v["hostname"] == hostname:
return v["host_state"] == usm_states.DEPLOY_HOST_STATES.DEPLOYED.value

View File

@ -1890,17 +1890,91 @@ class SwUpgradeStrategy(
stage = strategy.StrategyStage(strategy.STRATEGY_STAGE_NAME.SW_UPGRADE_START)
# If the release is not available the deployment is already started
if self.nfvi_upgrade.is_available:
stage.add_step(strategy.QueryAlarmsStep(True, ignore_alarms=self._ignore_alarms))
# sw-deploy start for major releases must be done on controller-0
self._swact_fix(stage, HOST_NAME.CONTROLLER_1)
stage.add_step(strategy.UpgradeStartStep(release=self._release))
else:
DLOG.info("Software deployment already inprogress, skipping start")
stage.add_step(strategy.QueryAlarmsStep(True, ignore_alarms=self._ignore_alarms))
# sw-deploy start for major releases must be done on controller-0
self._swact_fix(stage, HOST_NAME.CONTROLLER_1)
stage.add_step(strategy.UpgradeStartStep(release=self._release))
stage.add_step(strategy.SystemStabilizeStep(timeout_in_secs=MTCE_DELAY))
self.apply_phase.add_stage(stage)
def _add_upgrade_hosts_stages(self):
    """
    Add upgrade hosts strategy stages

    Hosts from the host table are grouped into controller, storage and
    worker lists, skipping any host that nfvi_upgrade reports as already
    deployed.  Each non-empty group is then handed to its stage builder.
    If a host has an unsupported personality, or a stage builder reports
    failure, the build is marked as failed and the method returns early.
    """
    from nfv_vim import strategy
    from nfv_vim import tables

    def _fail_build(reason):
        # Record the build failure, notify the sw-update object and save
        self._state = strategy.STRATEGY_STATE.BUILD_FAILED
        self.build_phase.result = \
            strategy.STRATEGY_PHASE_RESULT.FAILED
        self.build_phase.result_reason = reason
        self.sw_update_obj.strategy_build_complete(
            False, self.build_phase.result_reason)
        self.save()

    host_table = tables.tables_get_host_table()

    reboot_required = self.nfvi_upgrade.reboot_required
    controller_strategy = self._add_controller_strategy_stages
    controllers_hosts = list()
    storage_hosts = list()
    worker_hosts = list()

    for host in host_table.values():
        if self.nfvi_upgrade.is_host_deployed(host.name):
            # The f prefix is required, otherwise the literal text
            # "{host.name}" is logged instead of the host's name
            DLOG.info(f"Skipping deploy-host for already deployed host: {host.name}")
            continue

        if HOST_PERSONALITY.CONTROLLER in host.personality:
            controllers_hosts.append(host)
            if HOST_PERSONALITY.WORKER in host.personality:
                # We need to use this strategy on AIO type
                controller_strategy = self._add_worker_strategy_stages

        elif HOST_PERSONALITY.STORAGE in host.personality:
            storage_hosts.append(host)

        elif HOST_PERSONALITY.WORKER in host.personality:
            worker_hosts.append(host)

        else:
            DLOG.error(f"Unsupported personality for host {host.name}.")
            _fail_build('Unsupported personality for host')
            return

    if not self._single_controller and self.nfvi_upgrade.major_release:
        # Reverse controller hosts so controller-1 is first
        controllers_hosts = sorted(
            controllers_hosts,
            key=lambda x: x.name == HOST_NAME.CONTROLLER_0,
        )

    elif not self.nfvi_upgrade.major_release:
        local_host_name = get_local_host_name()
        # Sort the controller such that host other than
        # current local_host_name is the first element in the list.
        # This sorting is to reduce the number of swact required since
        # sw-deploy patch release orchestration can start on host that
        # is currently active.
        controllers_hosts = sorted(
            controllers_hosts,
            key=lambda x: x.name == local_host_name,
        )

    strategy_pairs = [
        (controller_strategy, controllers_hosts),
        (self._add_storage_strategy_stages, storage_hosts),
        (self._add_worker_strategy_stages, worker_hosts)
    ]

    for stage_func, host_list in strategy_pairs:
        if host_list:
            success, reason = stage_func(host_list, reboot_required)
            if not success:
                _fail_build(reason)
                return
def _add_upgrade_complete_stage(self):
"""
Add upgrade complete strategy stage
@ -1917,7 +1991,6 @@ class SwUpgradeStrategy(
def _build_complete_normal(self, result, result_reason):
from nfv_vim import strategy
from nfv_vim import tables
reason = ""
result, result_reason = \
@ -1931,8 +2004,8 @@ class SwUpgradeStrategy(
if not self.nfvi_upgrade.release_info or self.nfvi_upgrade.is_unavailable:
reason = "Software release does not exist or is unavailable."
elif self.nfvi_upgrade.is_deployed or self.nfvi_upgrade.is_committed:
reason = "Software release is already deployed or committed."
elif self.nfvi_upgrade.is_committed:
reason = "Software release is committed."
elif self.nfvi_upgrade.is_deploy_completed:
reason = "Software deployment is already complete."
@ -1940,6 +2013,19 @@ class SwUpgradeStrategy(
elif self._nfvi_alarms:
reason = "Active alarms found, can't apply software deployment."
elif self.nfvi_upgrade.is_deployed or self.nfvi_upgrade.is_available:
from nfv_vim import tables
bad_hosts = []
host_table = tables.tables_get_host_table()
for host in list(host_table.values()):
if not (host.is_unlocked() and host.is_enabled() and host.is_available()):
bad_hosts.append(host.name)
if bad_hosts:
reason = (
"All hosts must be unlocked-enabled-available " +
f"to start a new sw-deployment: {bad_hosts}"
)
if reason:
DLOG.warn(reason)
self._state = strategy.STRATEGY_STATE.BUILD_FAILED
@ -1950,97 +2036,46 @@ class SwUpgradeStrategy(
self.save()
return
host_table = tables.tables_get_host_table()
do_start = True
do_upgrade_hosts = True
do_complete = True
for host in list(host_table.values()):
# All hosts must be unlock/enabled/available
if not (host.is_unlocked() and host.is_enabled() and host.is_available()):
DLOG.warn(
"All hosts must be unlocked-enabled-available, "
"can't apply sw-deployment: %s" % host.name)
self._state = strategy.STRATEGY_STATE.BUILD_FAILED
self.build_phase.result = \
strategy.STRATEGY_PHASE_RESULT.FAILED
self.build_phase.result_reason = (
'all %s hosts must be unlocked-enabled-available' %
host.personality)
self.sw_update_obj.strategy_build_complete(
False, self.build_phase.result_reason)
self.save()
return
# Skip start if started
if (
self.nfvi_upgrade.is_start_done or
self.nfvi_upgrade.is_deploying_hosts or
self.nfvi_upgrade.is_deploying_hosts_failed
):
DLOG.info("Skipping sw-deploy-start: Already started")
do_start = False
reboot_required = self.nfvi_upgrade.reboot_required
controller_strategy = self._add_controller_strategy_stages
controllers_hosts = list()
storage_hosts = list()
worker_hosts = list()
# Skip straight to end if activate is complete or failed
elif (
self.nfvi_upgrade.is_deploying_hosts_done or
self.nfvi_upgrade.is_activating or
self.nfvi_upgrade.is_activate_done or
self.nfvi_upgrade.is_activate_failed
):
DLOG.info("Skipping sw-deploy-start: Already started")
DLOG.info("Skipping sw-deploy-hosts: Already deployed hosts")
do_start = False
do_upgrade_hosts = False
self._add_upgrade_start_stage()
# Already done, strategy probably would have failed already but just in case
elif self.nfvi_upgrade.is_deploy_completed:
DLOG.info("Doing nothing sw-deploy already completed")
do_start = False
do_upgrade_hosts = False
do_complete = False
for host in host_table.values():
if HOST_PERSONALITY.CONTROLLER in host.personality:
controllers_hosts.append(host)
if HOST_PERSONALITY.WORKER in host.personality:
# We need to use this strategy on AIO type
controller_strategy = self._add_worker_strategy_stages
if do_start:
self._add_upgrade_start_stage()
elif HOST_PERSONALITY.STORAGE in host.personality:
storage_hosts.append(host)
if do_upgrade_hosts:
self._add_upgrade_hosts_stages()
elif HOST_PERSONALITY.WORKER in host.personality:
worker_hosts.append(host)
else:
DLOG.error(f"Unsupported personality for host {host.name}.")
self._state = strategy.STRATEGY_STATE.BUILD_FAILED
self.build_phase.result = \
strategy.STRATEGY_PHASE_RESULT.FAILED
self.build_phase.result_reason = \
'Unsupported personality for host'
self.sw_update_obj.strategy_build_complete(
False, self.build_phase.result_reason)
self.save()
return
if not self._single_controller and self.nfvi_upgrade.major_release:
# Reverse controller hosts so controller-1 is first
controllers_hosts = sorted(
controllers_hosts,
key=lambda x: x.name == HOST_NAME.CONTROLLER_0,
)
elif not self.nfvi_upgrade.major_release:
local_host_name = get_local_host_name()
# Sort the controller such that host other than
# current local_host_name is the first element in the list.
# This sorting is to reduce the number of swact required since
# sw-deploy patch release orchestration can start on host that
# is currently active.
controllers_hosts = sorted(
controllers_hosts,
key=lambda x: x.name == local_host_name,
)
strategy_pairs = [
(controller_strategy, controllers_hosts),
(self._add_storage_strategy_stages, storage_hosts),
(self._add_worker_strategy_stages, worker_hosts)
]
for stage_func, host_list in strategy_pairs:
if host_list:
success, reason = stage_func(host_list, reboot_required)
if not success:
self._state = strategy.STRATEGY_STATE.BUILD_FAILED
self.build_phase.result = \
strategy.STRATEGY_PHASE_RESULT.FAILED
self.build_phase.result_reason = reason
self.sw_update_obj.strategy_build_complete(
False, self.build_phase.result_reason)
self.save()
return
self._add_upgrade_complete_stage()
if do_complete:
self._add_upgrade_complete_stage()
if 0 == len(self.apply_phase.stages):
DLOG.warn("No sw-deployments need to be applied.")

View File

@ -943,7 +943,7 @@ class SwDeployPrecheckStep(strategy.StrategyStep):
response = (yield)
DLOG.debug("sw-deploy precheck callback response=%s." % response)
if response['completed'] and response['result-data']:
if response['completed'] and response['complete-data'].get('system_healthy', False):
DLOG.debug("sw-deploy precheck completed")
result = strategy.STRATEGY_STEP_RESULT.SUCCESS
self.stage.step_complete(result, '')
@ -964,11 +964,16 @@ class SwDeployPrecheckStep(strategy.StrategyStep):
from nfv_vim import nfvi
DLOG.info("Step (%s) apply." % self._name)
force = (
self.strategy._alarm_restrictions == strategy.STRATEGY_ALARM_RESTRICTION_TYPES.RELAXED
)
nfvi.nfvi_sw_deploy_precheck(self._release, force, self._sw_deploy_precheck_callback())
return strategy.STRATEGY_STEP_RESULT.WAIT, ""
if self.strategy.nfvi_upgrade.is_deploying:
reason = "Deployment already in progress, skipping precheck"
DLOG.info(reason)
return strategy.STRATEGY_STEP_RESULT.SUCCESS, reason
else:
force = (
self.strategy._alarm_restrictions == strategy.STRATEGY_ALARM_RESTRICTION_TYPES.RELAXED
)
nfvi.nfvi_sw_deploy_precheck(self._release, force, self._sw_deploy_precheck_callback())
return strategy.STRATEGY_STEP_RESULT.WAIT, ""
def from_dict(self, data):
"""
@ -1179,11 +1184,22 @@ class UpgradeStartStep(strategy.StrategyStep):
from nfv_vim import nfvi
DLOG.info("Step (%s) apply." % self._name)
force = (
self.strategy._alarm_restrictions == strategy.STRATEGY_ALARM_RESTRICTION_TYPES.RELAXED
)
nfvi.nfvi_upgrade_start(self._release, force, self._start_upgrade_callback())
return strategy.STRATEGY_STEP_RESULT.WAIT, ""
result = strategy.STRATEGY_STEP_RESULT.WAIT
reason = ""
if self.strategy.nfvi_upgrade.is_starting:
DLOG.info("Deployment already starting, skipping start call")
elif self.strategy.nfvi_upgrade.is_start_done:
DLOG.info("Deployment already started, skipping start call")
result = strategy.STRATEGY_STEP_RESULT.SUCCESS
else:
force = (
self.strategy._alarm_restrictions == strategy.STRATEGY_ALARM_RESTRICTION_TYPES.RELAXED
)
nfvi.nfvi_upgrade_start(self._release, force, self._start_upgrade_callback())
return result, reason
def handle_event(self, event, event_data=None):
"""
@ -1253,8 +1269,6 @@ class UpgradeActivateStep(strategy.StrategyStep):
self.phase.result_complete_response(detailed_reason)
self.stage.step_complete(result, reason)
# TODO(jkraitbe): This will change in future
@coroutine
def _handle_activate_upgrade_callback(self):
"""
@ -1308,8 +1322,19 @@ class UpgradeActivateStep(strategy.StrategyStep):
from nfv_vim import nfvi
DLOG.info("Step (%s) apply." % self._name)
nfvi.nfvi_upgrade_activate(self._release, self._activate_upgrade_callback())
return strategy.STRATEGY_STEP_RESULT.WAIT, ""
result = strategy.STRATEGY_STEP_RESULT.WAIT
reason = ""
if self.strategy.nfvi_upgrade.is_activating:
DLOG.info("Deployment already activating, skipping activate call")
elif self.strategy.nfvi_upgrade.is_activate_done:
DLOG.info("Deployment already activated, skipping activate call")
result = strategy.STRATEGY_STEP_RESULT.SUCCESS
else:
nfvi.nfvi_upgrade_activate(self._release, self._activate_upgrade_callback())
return result, reason
def handle_event(self, event, event_data=None):
"""