Serialize VM starts after AIO-SX unlock
When an AIO-SX is locked, the VIM shuts down all the instances. When the AIO-SX is unlocked, the VIM then starts all the instances, but does so in parallel. In some configurations, this causes too much nova/neutron activity, which can lead to various issues, and some of the instances may fail to start. This commit changes the VIM to start only one instance at a time. Once the instance goes active/running, the VIM moves on to the next instance. If an instance fails to start within 60 seconds, the VIM moves on to the next instance anyway. Note that this change only impacts lock/unlock on AIO-SX - it does not change the behaviour for starting instances in parallel on compute hosts (e.g. as part of patch orchestration). Change-Id: I203d1ac5911e4dae08b8deefbec326cea0a819b5
This commit is contained in:
parent
1370e9d948
commit
1ec03d168e
|
@ -1 +1 @@
|
|||
TIS_PATCH_VER=70
|
||||
TIS_PATCH_VER=71
|
||||
|
|
|
@ -40,7 +40,7 @@ def _analysis_instances_success(instance_uuid, instance_name, records,
|
|||
|
||||
elif record_data['type'] \
|
||||
in [NFV_VIM.INSTANCE_START_STATE,
|
||||
NFV_VIM.INSTANCE_START_STATE_COMPLETED,
|
||||
NFV_VIM.INSTANCE_START_STATE_INPROGRESS,
|
||||
NFV_VIM.INSTANCE_STOP_STATE,
|
||||
NFV_VIM.INSTANCE_STOP_STATE_COMPLETED,
|
||||
NFV_VIM.INSTANCE_PAUSE_STATE,
|
||||
|
@ -190,7 +190,7 @@ def analysis_instance_start_success(instance_uuid, instance_name, records,
|
|||
= [(action, NFV_VIM.INSTANCE_NFVI_ACTION_START),
|
||||
(always, NFV_VIM.INSTANCE_START_STATE),
|
||||
(always, NFV_VIM.INSTANCE_START_CALLBACK),
|
||||
(always, NFV_VIM.INSTANCE_START_STATE_COMPLETED),
|
||||
(always, NFV_VIM.INSTANCE_START_STATE_INPROGRESS),
|
||||
(always, NFV_VIM.INSTANCE_INITIAL_STATE)]
|
||||
|
||||
expected_records = list()
|
||||
|
|
|
@ -43,7 +43,7 @@ class _NfvVimRecordType(object):
|
|||
INSTANCE_COLD_MIGRATE_REVERT_STATE_TIMED_OUT = 'nfv_vim_instance_cold_migrate_revert_state_timed_out'
|
||||
INSTANCE_EVACUATE_STATE = 'nfv_vim_instance_evacuate_state'
|
||||
INSTANCE_START_STATE = 'nfv_vim_instance_start_state'
|
||||
INSTANCE_START_STATE_COMPLETED = 'nfv_vim_instance_start_state_completed'
|
||||
INSTANCE_START_STATE_INPROGRESS = 'nfv_vim_instance_start_state_inprogress'
|
||||
INSTANCE_START_STATE_FAILED = 'nfv_vim_instance_start_state_failed'
|
||||
INSTANCE_START_STATE_TIMED_OUT = 'nfv_vim_instance_start_state_timed_out'
|
||||
INSTANCE_STOP_STATE = 'nfv_vim_instance_stop_state'
|
||||
|
|
|
@ -260,10 +260,10 @@ logs:
|
|||
fields:
|
||||
- instance_name
|
||||
|
||||
- name: Instance Start State Completed
|
||||
type: nfv_vim_instance_start_state_completed
|
||||
- name: Instance Start State Inprogress
|
||||
type: nfv_vim_instance_start_state_inprogress
|
||||
file: _instance_state_start.py
|
||||
regex: "Start completed for (.*)."
|
||||
regex: "Start inprogress for (.*)."
|
||||
fields:
|
||||
- instance_name
|
||||
|
||||
|
|
|
@ -24,6 +24,7 @@ class OperationTypes(Constants):
|
|||
UPGRADE_HOSTS = Constant('upgrade-hosts')
|
||||
SWACT_HOSTS = Constant('swact-hosts')
|
||||
START_INSTANCES = Constant('start-instances')
|
||||
START_INSTANCES_SERIAL = Constant('start-instances-serial')
|
||||
STOP_INSTANCES = Constant('stop-instances')
|
||||
MIGRATE_INSTANCES = Constant('migrate-instances')
|
||||
DISABLE_HOST_SERVICES = Constant('disable-host-services')
|
||||
|
@ -144,6 +145,13 @@ class Operation(object):
|
|||
"""
|
||||
return instance_uuid in self._instances
|
||||
|
||||
def instance_ready(self, instance_uuid):
|
||||
"""
|
||||
Returns true if instance exists and is in the READY state.
|
||||
"""
|
||||
return instance_uuid in self._instances and \
|
||||
OPERATION_STATE.READY == self._instances[instance_uuid]
|
||||
|
||||
def add_instance(self, instance_uuid, operation_state):
|
||||
"""
|
||||
Add the instance
|
||||
|
@ -197,7 +205,9 @@ class Operation(object):
|
|||
Returns true if the operation is inprogress
|
||||
"""
|
||||
return (OPERATION_STATE.INPROGRESS in self._hosts.values() or
|
||||
OPERATION_STATE.INPROGRESS in self._instances.values())
|
||||
OPERATION_STATE.READY in self._hosts.values() or
|
||||
OPERATION_STATE.INPROGRESS in self._instances.values() or
|
||||
OPERATION_STATE.READY in self._instances.values())
|
||||
|
||||
def is_failed(self):
|
||||
"""
|
||||
|
|
|
@ -750,7 +750,9 @@ class InstanceDirector(object):
|
|||
"""
|
||||
Host Start Instances
|
||||
"""
|
||||
if OPERATION_TYPE.START_INSTANCES != host_operation.operation_type:
|
||||
if host_operation.operation_type not in [
|
||||
OPERATION_TYPE.START_INSTANCES,
|
||||
OPERATION_TYPE.START_INSTANCES_SERIAL]:
|
||||
reason = ("Unsupported operation (%s) against host %s."
|
||||
% (host_operation.operation_type, host.name))
|
||||
DLOG.info(reason)
|
||||
|
@ -760,9 +762,14 @@ class InstanceDirector(object):
|
|||
initiated_by = objects.INSTANCE_ACTION_INITIATED_BY.DIRECTOR
|
||||
if OPERATION_TYPE.START_INSTANCES == host_operation.operation_type:
|
||||
reason = "start instances issued"
|
||||
elif OPERATION_TYPE.START_INSTANCES_SERIAL == \
|
||||
host_operation.operation_type:
|
||||
reason = "start instances serial issued"
|
||||
else:
|
||||
reason = None
|
||||
|
||||
starts_inprogress = 0
|
||||
|
||||
instance_table = tables.tables_get_instance_table()
|
||||
for instance in instance_table.on_host(host.name):
|
||||
if instance.uuid not in instance_uuids:
|
||||
|
@ -836,10 +843,19 @@ class InstanceDirector(object):
|
|||
host_operation.update_failure_reason(reason)
|
||||
return
|
||||
|
||||
host_operation.add_instance(instance.uuid, OPERATION_STATE.INPROGRESS)
|
||||
|
||||
instance.do_action(objects.INSTANCE_ACTION_TYPE.START,
|
||||
initiated_by=initiated_by, reason=reason)
|
||||
if OPERATION_TYPE.START_INSTANCES_SERIAL == \
|
||||
host_operation.operation_type and starts_inprogress >= 1:
|
||||
# When starting instances in serial, the first instance is
|
||||
# started and the rest are set to the READY state, to be
|
||||
# started later.
|
||||
host_operation.add_instance(instance.uuid,
|
||||
OPERATION_STATE.READY)
|
||||
else:
|
||||
host_operation.add_instance(instance.uuid,
|
||||
OPERATION_STATE.INPROGRESS)
|
||||
instance.do_action(objects.INSTANCE_ACTION_TYPE.START,
|
||||
initiated_by=initiated_by, reason=reason)
|
||||
starts_inprogress += 1
|
||||
|
||||
def instance_migrate_complete(self, instance, from_host_name, failed=False,
|
||||
timed_out=False, cancelled=False):
|
||||
|
@ -1098,7 +1114,9 @@ class InstanceDirector(object):
|
|||
DLOG.verbose("No host %s operation inprogress." % on_host_name)
|
||||
return
|
||||
|
||||
if host_operation.operation_type not in [OPERATION_TYPE.START_INSTANCES]:
|
||||
if host_operation.operation_type not in [
|
||||
OPERATION_TYPE.START_INSTANCES,
|
||||
OPERATION_TYPE.START_INSTANCES_SERIAL]:
|
||||
DLOG.verbose("Unexpected host %s operation %s, ignoring."
|
||||
% (on_host_name, host_operation.operation_type))
|
||||
return
|
||||
|
@ -1137,10 +1155,22 @@ class InstanceDirector(object):
|
|||
|
||||
if OPERATION_STATE.COMPLETED != host_operation_state:
|
||||
host_operation.update_failure_reason(reason)
|
||||
host_operation = self._host_operations.get(host.name, None)
|
||||
if host_operation is not None:
|
||||
del self._host_operations[host.name]
|
||||
return
|
||||
|
||||
if OPERATION_TYPE.START_INSTANCES_SERIAL == \
|
||||
host_operation.operation_type:
|
||||
# Check if there is another instance on this host ready to start.
|
||||
# We continue starting instances even if the previous instance
|
||||
# failed to start.
|
||||
instance_table = tables.tables_get_instance_table()
|
||||
for instance in instance_table.on_host(host.name):
|
||||
if host_operation.instance_ready(instance.uuid):
|
||||
host_operation.update_instance(instance.uuid,
|
||||
OPERATION_STATE.INPROGRESS)
|
||||
instance.do_action(
|
||||
objects.INSTANCE_ACTION_TYPE.START,
|
||||
initiated_by=objects.INSTANCE_ACTION_INITIATED_BY.DIRECTOR,
|
||||
reason="start instances serial issued")
|
||||
return
|
||||
|
||||
# Check if host operation is complete
|
||||
if not host_operation.is_inprogress():
|
||||
|
@ -1789,7 +1819,7 @@ class InstanceDirector(object):
|
|||
instance.unlock_to_recover = False
|
||||
|
||||
if instance_uuids:
|
||||
self.start_instances(instance_uuids)
|
||||
self.start_instances(instance_uuids, serial=True)
|
||||
|
||||
# Do not attempt to do the unlock again.
|
||||
open(NFV_VIM_UNLOCK_COMPLETE_FILE, 'w').close()
|
||||
|
@ -1963,7 +1993,7 @@ class InstanceDirector(object):
|
|||
|
||||
return overall_operation
|
||||
|
||||
def start_instances(self, instance_uuids):
|
||||
def start_instances(self, instance_uuids, serial=False):
|
||||
"""
|
||||
Start Instances
|
||||
"""
|
||||
|
@ -1972,7 +2002,12 @@ class InstanceDirector(object):
|
|||
host_table = tables.tables_get_host_table()
|
||||
instance_table = tables.tables_get_instance_table()
|
||||
|
||||
overall_operation = Operation(OPERATION_TYPE.START_INSTANCES)
|
||||
if serial:
|
||||
operation_type = OPERATION_TYPE.START_INSTANCES_SERIAL
|
||||
else:
|
||||
operation_type = OPERATION_TYPE.START_INSTANCES
|
||||
|
||||
overall_operation = Operation(operation_type)
|
||||
|
||||
host_operations = dict()
|
||||
for instance_uuid in instance_uuids:
|
||||
|
@ -2004,7 +2039,7 @@ class InstanceDirector(object):
|
|||
|
||||
host_operation = host_operations.get(instance.host_name, None)
|
||||
if host_operation is None:
|
||||
host_operation = Operation(OPERATION_TYPE.START_INSTANCES)
|
||||
host_operation = Operation(operation_type)
|
||||
host_operations[instance.host_name] = host_operation
|
||||
|
||||
for host_name, host_operation in host_operations.iteritems():
|
||||
|
|
|
@ -5,6 +5,7 @@
|
|||
#
|
||||
from nfv_common import debug
|
||||
from nfv_common import state_machine
|
||||
from nfv_common import timers
|
||||
|
||||
from _instance_defs import INSTANCE_STATE, INSTANCE_EVENT
|
||||
from _instance_tasks import StartTask
|
||||
|
@ -24,6 +25,7 @@ class StartState(state_machine.State):
|
|||
Entering start state
|
||||
"""
|
||||
DLOG.info("Entering state (%s) for %s." % (self.name, instance.name))
|
||||
instance.action_fsm.wait_time = 0
|
||||
instance.task = StartTask(instance)
|
||||
instance.task.start()
|
||||
|
||||
|
@ -64,9 +66,9 @@ class StartState(state_machine.State):
|
|||
return INSTANCE_STATE.INITIAL
|
||||
|
||||
elif INSTANCE_EVENT.TASK_COMPLETED == event:
|
||||
DLOG.debug("Start completed for %s." % instance.name)
|
||||
instance_director.instance_start_complete(instance, instance.host_name)
|
||||
return INSTANCE_STATE.INITIAL
|
||||
DLOG.debug("Start inprogress for %s." % instance.name)
|
||||
instance.action_fsm.wait_time = \
|
||||
timers.get_monotonic_timestamp_in_ms()
|
||||
|
||||
elif INSTANCE_EVENT.TASK_FAILED == event:
|
||||
DLOG.info("Start failed for %s." % instance.name)
|
||||
|
@ -82,6 +84,32 @@ class StartState(state_machine.State):
|
|||
failed=False, timed_out=True)
|
||||
return INSTANCE_STATE.INITIAL
|
||||
|
||||
elif INSTANCE_EVENT.NFVI_ENABLED == event:
|
||||
instance_director.instance_start_complete(instance,
|
||||
instance.host_name)
|
||||
return INSTANCE_STATE.INITIAL
|
||||
|
||||
elif INSTANCE_EVENT.AUDIT == event:
|
||||
if not instance.task.inprogress():
|
||||
if instance.is_enabled():
|
||||
instance_director.instance_start_complete(
|
||||
instance, instance.host_name)
|
||||
return INSTANCE_STATE.INITIAL
|
||||
else:
|
||||
now_ms = timers.get_monotonic_timestamp_in_ms()
|
||||
secs_expired = \
|
||||
(now_ms - instance.action_fsm.wait_time) / 1000
|
||||
# Only wait 60 seconds for the instance to start.
|
||||
if 60 <= secs_expired:
|
||||
instance.fail_action(instance.action_fsm_action_type,
|
||||
'timeout')
|
||||
instance_director.instance_start_complete(
|
||||
instance,
|
||||
instance.host_name,
|
||||
failed=False,
|
||||
timed_out=True)
|
||||
return INSTANCE_STATE.INITIAL
|
||||
|
||||
else:
|
||||
DLOG.verbose("Ignoring %s event for %s." % (event, instance.name))
|
||||
|
||||
|
|
Loading…
Reference in New Issue