Increase timeout to wait for alarms to clear.

This handles the case when OSDs are deployed on controller nodes and
patching fails after a controller is unlocked while there is an ongoing
HEALTH_WARN from ceph.
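
For reference, the wait is written as minutes multiplied by 60 in the strategy
code and asserted as plain seconds in the unit tests. A minimal sketch of that
arithmetic, using illustrative constant names that do not appear in the repo:

    # Illustrative names only; the strategy module writes these timeouts inline
    # as 10 * 60 and 30 * 60 rather than naming them.
    OLD_ALARM_WAIT = 10 * 60   # 600 s: previous wait-alarms-clear timeout
    NEW_ALARM_WAIT = 30 * 60   # 1800 s: allows Ceph OSDs to resync after an unlock
    assert (OLD_ALARM_WAIT, NEW_ALARM_WAIT) == (600, 1800)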

Closes-Bug: 1907259
Signed-off-by: Andrei Grosu <andrei.grosu@windriver.com>
Change-Id: Ibc71987049bc1040ca2c3c8db72bbac74cb35457
Author: Andrei Grosu   Date: 2021-01-21 13:26:50 -05:00
parent 6642d504ae
commit 9b79211a3c
2 changed files with 61 additions and 50 deletions


@ -2962,7 +2962,7 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'unlock-hosts',
'entity_names': ['controller-1']},
{'name': 'wait-alarms-clear',
'timeout': 600},
'timeout': 1800},
]
},
{'name': 'sw-patch-controllers',
@ -2980,7 +2980,7 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'unlock-hosts',
'entity_names': ['controller-0']},
{'name': 'wait-alarms-clear',
'timeout': 600},
'timeout': 1800},
]
},
]
@ -3071,8 +3071,8 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
'timeout': 15},
{'name': 'unlock-hosts',
'entity_names': ['controller-1']},
{'name': 'system-stabilize',
'timeout': 60},
{'name': 'wait-alarms-clear',
'timeout': 1800},
]
},
{'name': 'sw-patch-controllers',
@ -3089,8 +3089,8 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
'timeout': 15},
{'name': 'unlock-hosts',
'entity_names': ['controller-0']},
{'name': 'system-stabilize',
'timeout': 60},
{'name': 'wait-alarms-clear',
'timeout': 1800},
]
},
]
@ -3196,7 +3196,7 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'start-instances',
'entity_names': ['test_instance_0']},
{'name': 'wait-alarms-clear',
'timeout': 600},
'timeout': 1800},
]
},
{'name': 'sw-patch-worker-hosts',
@ -3218,7 +3218,7 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'start-instances',
'entity_names': ['test_instance_1']},
{'name': 'wait-alarms-clear',
'timeout': 600}
'timeout': 1800}
]
},
]
@ -3321,7 +3321,7 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'start-instances',
'entity_names': ['test_instance_0']},
{'name': 'wait-alarms-clear',
'timeout': 600}
'timeout': 1800}
]
},
{'name': 'sw-patch-worker-hosts',
@ -3343,7 +3343,7 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'start-instances',
'entity_names': ['test_instance_1']},
{'name': 'wait-alarms-clear',
'timeout': 600}
'timeout': 1800}
]
},
]
@ -3398,7 +3398,7 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'unlock-hosts',
'entity_names': ['controller-0']},
{'name': 'wait-alarms-clear',
'timeout': 600}
'timeout': 1800}
]
},
{'name': 'sw-patch-worker-hosts',
@ -3416,7 +3416,7 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'unlock-hosts',
'entity_names': ['controller-1']},
{'name': 'wait-alarms-clear',
'timeout': 600}
'timeout': 1800}
]
},
]
@ -3470,8 +3470,8 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
'timeout': 15},
{'name': 'unlock-hosts',
'entity_names': ['controller-0']},
{'name': 'system-stabilize',
'timeout': 60}
{'name': 'wait-alarms-clear',
'timeout': 1800}
]
},
{'name': 'sw-patch-worker-hosts',
@ -3488,8 +3488,8 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
'timeout': 15},
{'name': 'unlock-hosts',
'entity_names': ['controller-1']},
{'name': 'system-stabilize',
'timeout': 60}
{'name': 'wait-alarms-clear',
'timeout': 1800}
]
},
]
@ -3566,7 +3566,7 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'start-instances',
'entity_names': ['test_instance_0']},
{'name': 'wait-alarms-clear',
'timeout': 600},
'timeout': 1800},
]
},
{'name': 'sw-patch-worker-hosts',
@ -3588,7 +3588,7 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'start-instances',
'entity_names': ['test_instance_1']},
{'name': 'wait-alarms-clear',
'timeout': 600}
'timeout': 1800}
]
},
{'name': 'sw-patch-worker-hosts',
@ -3730,7 +3730,7 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'start-instances',
'entity_names': ['test_instance_0']},
{'name': 'wait-alarms-clear',
'timeout': 600}
'timeout': 1800}
]
},
{'name': 'sw-patch-worker-hosts',
@ -3752,7 +3752,7 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'start-instances',
'entity_names': ['test_instance_1']},
{'name': 'wait-alarms-clear',
'timeout': 600}
'timeout': 1800}
]
},
{'name': 'sw-patch-worker-hosts',
@ -3850,7 +3850,7 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'unlock-hosts',
'entity_names': ['controller-0']},
{'name': 'wait-alarms-clear',
'timeout': 600}
'timeout': 1800}
]
},
{'name': 'sw-patch-worker-hosts',
@ -3868,7 +3868,7 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'unlock-hosts',
'entity_names': ['controller-1']},
{'name': 'wait-alarms-clear',
'timeout': 600}
'timeout': 1800}
]
},
{'name': 'sw-patch-worker-hosts',
@ -3985,8 +3985,8 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
'timeout': 15},
{'name': 'unlock-hosts',
'entity_names': ['controller-0']},
{'name': 'system-stabilize',
'timeout': 60},
{'name': 'wait-alarms-clear',
'timeout': 1800},
]
},
]
@ -4044,7 +4044,7 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'start-instances',
'entity_names': ['test_instance_0']},
{'name': 'wait-alarms-clear',
'timeout': 600},
'timeout': 1800},
]
},
]
@ -4095,7 +4095,7 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'unlock-hosts',
'entity_names': ['controller-0']},
{'name': 'wait-alarms-clear',
'timeout': 600}
'timeout': 1800}
]
},
]
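
The strategy changes that satisfy these expectations follow below. Each of the
updated expectations above reduces to the same tail of steps; a minimal,
self-contained sketch of that tail (the real test fixtures wrap these dicts in
additional keys and helpers not shown in this commit):

    # Tail of one expected stage, taken from the hunks above.
    expected_tail = [
        {'name': 'unlock-hosts', 'entity_names': ['controller-1']},
        {'name': 'wait-alarms-clear', 'timeout': 1800},  # was 600, or system-stabilize with 60
    ]
    assert expected_tail[-1]['timeout'] == 30 * 60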


@ -610,14 +610,14 @@ class SwPatchStrategy(SwUpdateStrategy):
stage.add_step(strategy.SystemStabilizeStep(
timeout_in_secs=MTCE_DELAY))
stage.add_step(strategy.UnlockHostsStep(host_list))
if host.openstack_control:
# Wait extra time for services to go enabled and
# alarms to clear.
stage.add_step(strategy.WaitAlarmsClearStep(
timeout_in_secs=10 * 60,
ignore_alarms=self._ignore_alarms))
else:
stage.add_step(strategy.SystemStabilizeStep())
# After controller node(s) are unlocked, we need extra time to
# allow the OSDs to go back in sync and the storage related
# alarms to clear. Note: not all controller nodes will have
# OSDs configured, but the alarms should clear quickly in
# that case so this will not delay the patch strategy.
stage.add_step(strategy.WaitAlarmsClearStep(
timeout_in_secs=30 * 60,
ignore_alarms=self._ignore_alarms))
else:
# Less time required if host is not rebooting
stage.add_step(strategy.SystemStabilizeStep(
@ -639,14 +639,14 @@ class SwPatchStrategy(SwUpdateStrategy):
stage.add_step(strategy.SystemStabilizeStep(
timeout_in_secs=MTCE_DELAY))
stage.add_step(strategy.UnlockHostsStep(host_list))
if host.openstack_control:
# Wait extra time for services to go enabled and
# alarms to clear.
stage.add_step(strategy.WaitAlarmsClearStep(
timeout_in_secs=10 * 60,
ignore_alarms=self._ignore_alarms))
else:
stage.add_step(strategy.SystemStabilizeStep())
# After controller node(s) are unlocked, we need extra time to
# allow the OSDs to go back in sync and the storage related
# alarms to clear. Note: not all controller nodes will have
# OSDs configured, but the alarms should clear quickly in
# that case so this will not delay the patch strategy.
stage.add_step(strategy.WaitAlarmsClearStep(
timeout_in_secs=30 * 60,
ignore_alarms=self._ignore_alarms))
else:
# Less time required if host is not rebooting
stage.add_step(strategy.SystemStabilizeStep(
@ -860,18 +860,29 @@ class SwPatchStrategy(SwUpdateStrategy):
self._default_instance_action:
stage.add_step(strategy.StartInstancesStep(
instance_list))
if any(host.openstack_control or host.openstack_compute for host in hosts_to_lock) or \
any(host.openstack_control or host.openstack_compute for host in hosts_to_reboot):
# Wait extra time for services to go enabled
# and alarms to clear.
# After controller node(s) are unlocked, we need extra time to
# allow the OSDs to go back in sync and the storage related
# alarms to clear. Note: not all controller nodes will have
# OSDs configured, but the alarms should clear quickly in
# that case so this will not delay the patch strategy.
if any([HOST_PERSONALITY.CONTROLLER in host.personality
for host in hosts_to_lock + hosts_to_reboot]):
# Multiple personality nodes that need to wait for OSDs to sync:
stage.add_step(strategy.WaitAlarmsClearStep(
timeout_in_secs=10 * 60,
timeout_in_secs=30 * 60,
ignore_alarms=self._ignore_alarms))
else:
stage.add_step(strategy.SystemStabilizeStep())
if any([host.openstack_control or host.openstack_compute
for host in hosts_to_lock + hosts_to_reboot]):
# Hosts with openstack that just need to wait for services to start up:
stage.add_step(strategy.WaitAlarmsClearStep(
timeout_in_secs=10 * 60,
ignore_alarms=self._ignore_alarms))
else:
# Worker host without multiple personalities or openstack:
stage.add_step(strategy.SystemStabilizeStep())
else:
# Less time required if host is not rebooting
# Less time required if host is not rebooting:
stage.add_step(strategy.SystemStabilizeStep(
timeout_in_secs=NO_REBOOT_DELAY))
self.apply_phase.add_stage(stage)
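
Taken together, the new worker-stage logic selects the post-unlock wait roughly
as follows. This is a hedged, self-contained sketch: Host, post_unlock_wait and
the personality string are illustrative stand-ins, not the VIM's HOST_PERSONALITY
constants or strategy step classes; hosts stands for the combined
hosts_to_lock + hosts_to_reboot list, and the 60 s stabilize default is taken
from the test expectations above.

    from collections import namedtuple

    # Minimal stand-in for the VIM host object; only the fields used below.
    Host = namedtuple('Host', ['personality', 'openstack_control', 'openstack_compute'])

    CONTROLLER = 'controller'


    def post_unlock_wait(hosts):
        """Return (step_name, timeout_in_secs) for the wait added after unlock."""
        if any(CONTROLLER in host.personality for host in hosts):
            # Combined controller+worker nodes may carry Ceph OSDs; allow up to
            # 30 minutes for the OSDs to resync and the storage alarms to clear.
            return 'wait-alarms-clear', 30 * 60
        if any(host.openstack_control or host.openstack_compute for host in hosts):
            # OpenStack hosts only need time for services to go enabled again.
            return 'wait-alarms-clear', 10 * 60
        # Plain worker without OpenStack: a short stabilize step is enough.
        return 'system-stabilize', 60


    assert post_unlock_wait([Host('controller+worker', False, False)]) == ('wait-alarms-clear', 1800)
    assert post_unlock_wait([Host('worker', True, False)]) == ('wait-alarms-clear', 600)
    assert post_unlock_wait([Host('worker', False, False)]) == ('system-stabilize', 60)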