Serialize VM starts after AIO-SX unlock

When an AIO-SX is locked, the VIM shuts down all the instances.
When the AIO-SX is unlocked, the VIM then starts all the
instances, but does this in parallel. In some configurations, this
causes too much nova/neutron activity, which can cause various
issues, and some of the instances may fail to start.

This commit changes the VIM to only start one instance at a time.
Once the instance goes active/running, the VIM will then move on to
the next instance. If an instance fails to start within 60 seconds,
the VIM moves on to the next instance anyway. Note that this change only
impacts lock/unlock on AIO-SX - it does not change the behaviour
for starting instances in parallel on compute hosts (e.g. as part
of patch orchestration).

Change-Id: I203d1ac5911e4dae08b8deefbec326cea0a819b5
This commit is contained in:
Bart Wensley 2018-04-11 08:00:32 -05:00 committed by Al Bailey
parent 1370e9d948
commit 1ec03d168e
7 changed files with 98 additions and 25 deletions

View File

@ -1 +1 @@
TIS_PATCH_VER=70
TIS_PATCH_VER=71

View File

@ -40,7 +40,7 @@ def _analysis_instances_success(instance_uuid, instance_name, records,
elif record_data['type'] \
in [NFV_VIM.INSTANCE_START_STATE,
NFV_VIM.INSTANCE_START_STATE_COMPLETED,
NFV_VIM.INSTANCE_START_STATE_INPROGRESS,
NFV_VIM.INSTANCE_STOP_STATE,
NFV_VIM.INSTANCE_STOP_STATE_COMPLETED,
NFV_VIM.INSTANCE_PAUSE_STATE,
@ -190,7 +190,7 @@ def analysis_instance_start_success(instance_uuid, instance_name, records,
= [(action, NFV_VIM.INSTANCE_NFVI_ACTION_START),
(always, NFV_VIM.INSTANCE_START_STATE),
(always, NFV_VIM.INSTANCE_START_CALLBACK),
(always, NFV_VIM.INSTANCE_START_STATE_COMPLETED),
(always, NFV_VIM.INSTANCE_START_STATE_INPROGRESS),
(always, NFV_VIM.INSTANCE_INITIAL_STATE)]
expected_records = list()

View File

@ -43,7 +43,7 @@ class _NfvVimRecordType(object):
INSTANCE_COLD_MIGRATE_REVERT_STATE_TIMED_OUT = 'nfv_vim_instance_cold_migrate_revert_state_timed_out'
INSTANCE_EVACUATE_STATE = 'nfv_vim_instance_evacuate_state'
INSTANCE_START_STATE = 'nfv_vim_instance_start_state'
INSTANCE_START_STATE_COMPLETED = 'nfv_vim_instance_start_state_completed'
INSTANCE_START_STATE_INPROGRESS = 'nfv_vim_instance_start_state_inprogress'
INSTANCE_START_STATE_FAILED = 'nfv_vim_instance_start_state_failed'
INSTANCE_START_STATE_TIMED_OUT = 'nfv_vim_instance_start_state_timed_out'
INSTANCE_STOP_STATE = 'nfv_vim_instance_stop_state'

View File

@ -260,10 +260,10 @@ logs:
fields:
- instance_name
- name: Instance Start State Completed
type: nfv_vim_instance_start_state_completed
- name: Instance Start State Inprogress
type: nfv_vim_instance_start_state_inprogress
file: _instance_state_start.py
regex: "Start completed for (.*)."
regex: "Start inprogress for (.*)."
fields:
- instance_name

View File

@ -24,6 +24,7 @@ class OperationTypes(Constants):
UPGRADE_HOSTS = Constant('upgrade-hosts')
SWACT_HOSTS = Constant('swact-hosts')
START_INSTANCES = Constant('start-instances')
START_INSTANCES_SERIAL = Constant('start-instances-serial')
STOP_INSTANCES = Constant('stop-instances')
MIGRATE_INSTANCES = Constant('migrate-instances')
DISABLE_HOST_SERVICES = Constant('disable-host-services')
@ -144,6 +145,13 @@ class Operation(object):
"""
return instance_uuid in self._instances
def instance_ready(self, instance_uuid):
    """
    Returns true only when the given instance is tracked by this
    operation and its recorded operation state is READY.
    """
    # An instance that was never added to the operation cannot be ready.
    if instance_uuid not in self._instances:
        return False

    recorded_state = self._instances[instance_uuid]
    return OPERATION_STATE.READY == recorded_state
def add_instance(self, instance_uuid, operation_state):
"""
Add the instance
@ -197,7 +205,9 @@ class Operation(object):
Returns true if the operation is inprogress
"""
return (OPERATION_STATE.INPROGRESS in self._hosts.values() or
OPERATION_STATE.INPROGRESS in self._instances.values())
OPERATION_STATE.READY in self._hosts.values() or
OPERATION_STATE.INPROGRESS in self._instances.values() or
OPERATION_STATE.READY in self._instances.values())
def is_failed(self):
"""

View File

@ -750,7 +750,9 @@ class InstanceDirector(object):
"""
Host Start Instances
"""
if OPERATION_TYPE.START_INSTANCES != host_operation.operation_type:
if host_operation.operation_type not in [
OPERATION_TYPE.START_INSTANCES,
OPERATION_TYPE.START_INSTANCES_SERIAL]:
reason = ("Unsupported operation (%s) against host %s."
% (host_operation.operation_type, host.name))
DLOG.info(reason)
@ -760,9 +762,14 @@ class InstanceDirector(object):
initiated_by = objects.INSTANCE_ACTION_INITIATED_BY.DIRECTOR
if OPERATION_TYPE.START_INSTANCES == host_operation.operation_type:
reason = "start instances issued"
elif OPERATION_TYPE.START_INSTANCES_SERIAL == \
host_operation.operation_type:
reason = "start instances serial issued"
else:
reason = None
starts_inprogress = 0
instance_table = tables.tables_get_instance_table()
for instance in instance_table.on_host(host.name):
if instance.uuid not in instance_uuids:
@ -836,10 +843,19 @@ class InstanceDirector(object):
host_operation.update_failure_reason(reason)
return
host_operation.add_instance(instance.uuid, OPERATION_STATE.INPROGRESS)
instance.do_action(objects.INSTANCE_ACTION_TYPE.START,
initiated_by=initiated_by, reason=reason)
if OPERATION_TYPE.START_INSTANCES_SERIAL == \
host_operation.operation_type and starts_inprogress >= 1:
# When starting instances in serial, the first instance is
# started and the rest are set to the READY state, to be
# started later.
host_operation.add_instance(instance.uuid,
OPERATION_STATE.READY)
else:
host_operation.add_instance(instance.uuid,
OPERATION_STATE.INPROGRESS)
instance.do_action(objects.INSTANCE_ACTION_TYPE.START,
initiated_by=initiated_by, reason=reason)
starts_inprogress += 1
def instance_migrate_complete(self, instance, from_host_name, failed=False,
timed_out=False, cancelled=False):
@ -1098,7 +1114,9 @@ class InstanceDirector(object):
DLOG.verbose("No host %s operation inprogress." % on_host_name)
return
if host_operation.operation_type not in [OPERATION_TYPE.START_INSTANCES]:
if host_operation.operation_type not in [
OPERATION_TYPE.START_INSTANCES,
OPERATION_TYPE.START_INSTANCES_SERIAL]:
DLOG.verbose("Unexpected host %s operation %s, ignoring."
% (on_host_name, host_operation.operation_type))
return
@ -1137,10 +1155,22 @@ class InstanceDirector(object):
if OPERATION_STATE.COMPLETED != host_operation_state:
host_operation.update_failure_reason(reason)
host_operation = self._host_operations.get(host.name, None)
if host_operation is not None:
del self._host_operations[host.name]
return
if OPERATION_TYPE.START_INSTANCES_SERIAL == \
host_operation.operation_type:
# Check if there is another instance on this host ready to start.
# We continue starting instances even if the previous instance
# failed to start.
instance_table = tables.tables_get_instance_table()
for instance in instance_table.on_host(host.name):
if host_operation.instance_ready(instance.uuid):
host_operation.update_instance(instance.uuid,
OPERATION_STATE.INPROGRESS)
instance.do_action(
objects.INSTANCE_ACTION_TYPE.START,
initiated_by=objects.INSTANCE_ACTION_INITIATED_BY.DIRECTOR,
reason="start instances serial issued")
return
# Check if host operation is complete
if not host_operation.is_inprogress():
@ -1789,7 +1819,7 @@ class InstanceDirector(object):
instance.unlock_to_recover = False
if instance_uuids:
self.start_instances(instance_uuids)
self.start_instances(instance_uuids, serial=True)
# Do not attempt to do the unlock again.
open(NFV_VIM_UNLOCK_COMPLETE_FILE, 'w').close()
@ -1963,7 +1993,7 @@ class InstanceDirector(object):
return overall_operation
def start_instances(self, instance_uuids):
def start_instances(self, instance_uuids, serial=False):
"""
Start Instances
"""
@ -1972,7 +2002,12 @@ class InstanceDirector(object):
host_table = tables.tables_get_host_table()
instance_table = tables.tables_get_instance_table()
overall_operation = Operation(OPERATION_TYPE.START_INSTANCES)
if serial:
operation_type = OPERATION_TYPE.START_INSTANCES_SERIAL
else:
operation_type = OPERATION_TYPE.START_INSTANCES
overall_operation = Operation(operation_type)
host_operations = dict()
for instance_uuid in instance_uuids:
@ -2004,7 +2039,7 @@ class InstanceDirector(object):
host_operation = host_operations.get(instance.host_name, None)
if host_operation is None:
host_operation = Operation(OPERATION_TYPE.START_INSTANCES)
host_operation = Operation(operation_type)
host_operations[instance.host_name] = host_operation
for host_name, host_operation in host_operations.iteritems():

View File

@ -5,6 +5,7 @@
#
from nfv_common import debug
from nfv_common import state_machine
from nfv_common import timers
from _instance_defs import INSTANCE_STATE, INSTANCE_EVENT
from _instance_tasks import StartTask
@ -24,6 +25,7 @@ class StartState(state_machine.State):
Entering start state
"""
DLOG.info("Entering state (%s) for %s." % (self.name, instance.name))
instance.action_fsm.wait_time = 0
instance.task = StartTask(instance)
instance.task.start()
@ -64,9 +66,9 @@ class StartState(state_machine.State):
return INSTANCE_STATE.INITIAL
elif INSTANCE_EVENT.TASK_COMPLETED == event:
DLOG.debug("Start completed for %s." % instance.name)
instance_director.instance_start_complete(instance, instance.host_name)
return INSTANCE_STATE.INITIAL
DLOG.debug("Start inprogress for %s." % instance.name)
instance.action_fsm.wait_time = \
timers.get_monotonic_timestamp_in_ms()
elif INSTANCE_EVENT.TASK_FAILED == event:
DLOG.info("Start failed for %s." % instance.name)
@ -82,6 +84,32 @@ class StartState(state_machine.State):
failed=False, timed_out=True)
return INSTANCE_STATE.INITIAL
elif INSTANCE_EVENT.NFVI_ENABLED == event:
instance_director.instance_start_complete(instance,
instance.host_name)
return INSTANCE_STATE.INITIAL
elif INSTANCE_EVENT.AUDIT == event:
if not instance.task.inprogress():
if instance.is_enabled():
instance_director.instance_start_complete(
instance, instance.host_name)
return INSTANCE_STATE.INITIAL
else:
now_ms = timers.get_monotonic_timestamp_in_ms()
secs_expired = \
(now_ms - instance.action_fsm.wait_time) / 1000
# Only wait 60 seconds for the instance to start.
if 60 <= secs_expired:
instance.fail_action(instance.action_fsm_action_type,
'timeout')
instance_director.instance_start_complete(
instance,
instance.host_name,
failed=False,
timed_out=True)
return INSTANCE_STATE.INITIAL
else:
DLOG.verbose("Ignoring %s event for %s." % (event, instance.name))