Neutron Host Management Robustness

If a network agent goes down in the middle of processing
a reschedule/rebalance operation, abort the current operation
so as not to schedule resources onto a down agent.  This is
necessary as the reschedule/rebalance operations may take a
fair amount of time, and users may lock hosts (bringing
down the network agents) at any time.

Change-Id: Ie933b517e4e4cea3beb7d00e3c65d2f538f14e5c
Story: 2003857
Task: 30500
Signed-off-by: Kevin Smith <kevin.smith@windriver.com>
This commit is contained in:
Kevin Smith 2019-04-12 16:28:08 -04:00
parent 9b600d7b9e
commit 037727447b
4 changed files with 476 additions and 8 deletions

View File

@ -60,7 +60,8 @@ def build_get_agents_response():
return get_agents_response
def build_get_dhcp_agent_networks_response(agent_id, use_strange_networks=False):
def build_get_dhcp_agent_networks_response(agent_id,
use_strange_networks=False):
get_dhcp_agent_networks_response = dict()
get_dhcp_agent_networks_response['completed'] = True
get_dhcp_agent_networks_response['reason'] = ''
@ -238,7 +239,188 @@ class TestNeutronDHCPRebalance(testcase.NFVTestCase):
print("less than 2 agents, nothing to do")
break
if loopcount > MAX_LOOPCOUNT:
if loopcount >= MAX_LOOPCOUNT:
print("Loopcount exit!!! loopcount:%s" % loopcount)
assert loopcount < MAX_LOOPCOUNT
@mock.patch('nfv_vim.nfvi.nfvi_get_dhcp_agent_networks',
fake_nfvi_get_dhcp_agent_networks)
def test_rebalance_down_host_abort_w_api_calls(self):
# Host-down abort scenario for the DHCP rebalance state machine.
# 'compute-0' is queued with the flag True (presumably "host going down";
# confirm against add_rebalance_work_dhcp), then 'compute-1' is queued the
# same way once the state machine reaches a randomly chosen state.
# The patch above swaps nfvi_get_dhcp_agent_networks for the fake.
initial_network_count = 0
initial_network_config = list()
# States at which the second event may be injected.
abort_state_list = [DHCP_REBALANCE_STATE.GET_DHCP_AGENTS,
DHCP_REBALANCE_STATE.GET_NETWORKS_HOSTED_ON_AGENT,
DHCP_REBALANCE_STATE.GET_HOST_PHYSICAL_NETWORKS,
DHCP_REBALANCE_STATE.RESCHEDULE_DOWN_AGENT,
DHCP_REBALANCE_STATE.HOLD_OFF,
DHCP_REBALANCE_STATE.DONE]
# 199 randomized iterations (range(1, 200)).
for x in range(1, 200):
_DHCPRebalance.network_diff_threshold = random.randint(1, 4)
add_rebalance_work_dhcp('compute-0', True)
loopcount = 0
if DEBUG_PRINTING:
print("HOST DOWN TEST NUMBER %s" % str(x))
# aborted: the second event has been injected in this iteration.
# doing_abort: the injection happened on the current step.
aborted = False
doing_abort = False
# Index into abort_state_list; randint is inclusive on both ends.
abort_state = random.randint(0, len(abort_state_list) - 1)
while True:
loopcount += 1
old_state = _DHCPRebalance.get_state()
# Inject the second event at most once per iteration.
if old_state == (abort_state_list[abort_state]) and (not aborted):
aborted = True
doing_abort = True
add_rebalance_work_dhcp('compute-1', True)
if DEBUG_PRINTING:
print("host-down adding compute-1 down in state: %s." %
old_state)
_run_state_machine()
new_state = _DHCPRebalance.get_state()
# Only the step right after the injection is checked.
if doing_abort:
doing_abort = False
# No next-state expectation when injected in DONE or HOLD_OFF.
if (old_state != DHCP_REBALANCE_STATE.DONE) and \
(old_state != DHCP_REBALANCE_STATE.HOLD_OFF):
if _DHCPRebalance.num_dhcp_agents < 2:
assert(new_state == DHCP_REBALANCE_STATE.DONE)
else:
assert(new_state ==
DHCP_REBALANCE_STATE.GET_NETWORKS_HOSTED_ON_AGENT)
# Snapshot the per-agent network counts on this transition so the
# totals can be compared once the run finishes.
if ((old_state ==
DHCP_REBALANCE_STATE.GET_NETWORKS_HOSTED_ON_AGENT) and
(new_state ==
DHCP_REBALANCE_STATE.GET_HOST_PHYSICAL_NETWORKS)):
for idx in range(len(_DHCPRebalance.num_networks_on_agents)):
initial_network_config.append(
_DHCPRebalance.num_networks_on_agents[idx])
initial_network_count = \
sum(_DHCPRebalance.num_networks_on_agents)
# DONE with no host-down work left queued: verify the outcome.
if (_DHCPRebalance.get_state() == DHCP_REBALANCE_STATE.DONE) and \
(len(_DHCPRebalance.host_down_queue) == 0):
final_network_count = \
sum(_DHCPRebalance.num_networks_on_agents)
if DEBUG_PRINTING:
print("network_diff_threshold: %s" %
_DHCPRebalance.network_diff_threshold)
print("initial_network_count: %s, "
"final_network_count: %s" %
(initial_network_count, final_network_count))
print("initial num_networks_on_agents: %s, "
"final num_networks_on_agents: %s" %
(initial_network_config,
_DHCPRebalance.num_networks_on_agents))
# Clear the snapshot in place for the next iteration.
del initial_network_config[:]
# NOTE(review): the guard is "> 2" but the else-branch message below
# says "less than 2 agents", and num_networks_length == 0 can never
# hold under this guard -- confirm whether ">= 2" was intended.
if len(_DHCPRebalance.num_networks_on_agents) > 2:
num_networks_length = \
len(_DHCPRebalance.num_networks_on_agents)
# The first agent entry must hold no networks.
assert ((num_networks_length == 0) or
_DHCPRebalance.num_networks_on_agents[0] == 0)
# No network may be lost or duplicated.
assert (initial_network_count == final_network_count)
else:
if DEBUG_PRINTING:
print("less than 2 agents, nothing to do")
break
# Runaway guard: once the step budget is reached the assert fails.
if loopcount >= MAX_LOOPCOUNT:
print("Loopcount exit!!! loopcount:%s" % loopcount)
assert loopcount < MAX_LOOPCOUNT
@mock.patch('nfv_vim.nfvi.nfvi_get_dhcp_agent_networks',
fake_nfvi_get_dhcp_agent_networks)
def test_rebalance_up_host_abort_randomized_w_api_calls(self):
# Host-up abort scenario for the DHCP rebalance state machine.
# 'compute-0' is queued with the flag False (presumably "host coming up";
# confirm against add_rebalance_work_dhcp), then 'compute-1' is queued
# with the flag True once the state machine reaches a random state.
# The patch above swaps nfvi_get_dhcp_agent_networks for the fake.
initial_network_count = 0
initial_network_config = list()
# States at which the host-down event may be injected.
abort_state_list = [DHCP_REBALANCE_STATE.GET_DHCP_AGENTS,
DHCP_REBALANCE_STATE.GET_NETWORKS_HOSTED_ON_AGENT,
DHCP_REBALANCE_STATE.GET_HOST_PHYSICAL_NETWORKS,
DHCP_REBALANCE_STATE.RESCHEDULE_NEW_AGENT,
DHCP_REBALANCE_STATE.HOLD_OFF,
DHCP_REBALANCE_STATE.DONE]
# 199 randomized iterations (range(1, 200)).
for x in range(1, 200):
_DHCPRebalance.network_diff_threshold = random.randint(1, 4)
add_rebalance_work_dhcp('compute-0', False)
# aborted: the host-down event has been injected in this iteration.
# doing_abort: the injection happened on the current step.
aborted = False
doing_abort = False
# Index into abort_state_list; randint is inclusive on both ends.
abort_state = random.randint(0, len(abort_state_list) - 1)
loopcount = 0
if DEBUG_PRINTING:
print("HOST UP TEST NUMBER %s" % str(x))
while True:
loopcount += 1
old_state = _DHCPRebalance.get_state()
# Inject the host-down event at most once per iteration.
if old_state == (abort_state_list[abort_state]) and (not aborted):
aborted = True
doing_abort = True
add_rebalance_work_dhcp('compute-1', True)
if DEBUG_PRINTING:
print("host-up adding compute-1 down in state: %s." %
old_state)
_run_state_machine()
new_state = _DHCPRebalance.get_state()
# Only the step right after the injection is checked.
if doing_abort:
doing_abort = False
# Injected mid-run: the machine is expected to land in HOLD_OFF.
if (old_state != DHCP_REBALANCE_STATE.DONE) and \
(old_state != DHCP_REBALANCE_STATE.HOLD_OFF):
assert(new_state ==
DHCP_REBALANCE_STATE.HOLD_OFF)
# Snapshot the per-agent network counts on this transition so the
# totals can be compared once the run finishes.
if ((old_state ==
DHCP_REBALANCE_STATE.GET_NETWORKS_HOSTED_ON_AGENT) and
((new_state ==
DHCP_REBALANCE_STATE.GET_HOST_PHYSICAL_NETWORKS) or
(new_state == DHCP_REBALANCE_STATE.DONE))):
# new_state DONE is for already balanced case
for idx in range(len(_DHCPRebalance.num_networks_on_agents)):
initial_network_config.append(
_DHCPRebalance.num_networks_on_agents[idx])
initial_network_count = sum(
_DHCPRebalance.num_networks_on_agents)
# DONE with both queues drained: verify the outcome.
if ((_DHCPRebalance.get_state() == DHCP_REBALANCE_STATE.DONE) and
(len(_DHCPRebalance.host_up_queue) == 0) and
(len(_DHCPRebalance.host_down_queue) == 0)):
final_network_count = sum(
_DHCPRebalance.num_networks_on_agents)
if DEBUG_PRINTING:
print("network_diff_threshold: %s" %
_DHCPRebalance.network_diff_threshold)
print("initial_network_count: %s, "
"final_network_count: %s" %
(initial_network_count, final_network_count))
print("initial num_networks_on_agents: %s, "
"final num_networks_on_agents: %s" %
(initial_network_config,
_DHCPRebalance.num_networks_on_agents))
# Clear the snapshot in place for the next iteration.
del initial_network_config[:]
# NOTE(review): guard is "> 2" while the else-branch message below says
# "less than 2 agents" -- confirm whether ">= 2" was intended.
if len(_DHCPRebalance.num_networks_on_agents) > 2:
# No network may be lost or duplicated.
assert (initial_network_count == final_network_count)
# Spread between the busiest and idlest agent stays within threshold.
assert (max(_DHCPRebalance.num_networks_on_agents) -
min(_DHCPRebalance.num_networks_on_agents) <=
_DHCPRebalance.network_diff_threshold)
else:
if DEBUG_PRINTING:
print("less than 2 agents, nothing to do")
break
# Runaway guard: once the step budget is reached the assert fails.
if loopcount >= MAX_LOOPCOUNT:
print("Loopcount exit!!! loopcount:%s" % loopcount)
assert loopcount < MAX_LOOPCOUNT
@ -298,7 +480,7 @@ class TestNeutronDHCPRebalance(testcase.NFVTestCase):
print("less than 2 agents, nothing to do")
break
if loopcount > MAX_LOOPCOUNT:
if loopcount >= MAX_LOOPCOUNT:
print("Loopcount exit!!! loopcount:%s" % loopcount)
assert loopcount < MAX_LOOPCOUNT
@ -355,7 +537,7 @@ class TestNeutronDHCPRebalance(testcase.NFVTestCase):
print("less than 2 agents, nothing to do")
break
if loopcount > MAX_LOOPCOUNT:
if loopcount >= MAX_LOOPCOUNT:
print("Loopcount exit!!! loopcount:%s" % loopcount)
assert loopcount < MAX_LOOPCOUNT

View File

@ -277,7 +277,99 @@ class TestNeutronRebalance2(testcase.NFVTestCase):
print("less than 2 agents, nothing to do")
break
if loopcount > MAX_LOOPCOUNT:
if loopcount >= MAX_LOOPCOUNT:
print("Loopcount exit!!! loopcount:%s" % loopcount)
assert loopcount < MAX_LOOPCOUNT
def test_rebalance_down_host_abort_randomized_w_api_calls(self):
# Host-down abort scenario for the L3 rebalance state machine.
# 'compute-0' is queued with the flag True (presumably "host going down";
# confirm against add_rebalance_work_l3), then 'compute-1' is queued the
# same way once the state machine reaches a randomly chosen state.
initial_router_count = 0
initial_router_config = list()
# States at which the second event may be injected.
abort_state_list = [L3_REBALANCE_STATE.GET_NETWORK_AGENTS,
L3_REBALANCE_STATE.GET_ROUTERS_HOSTED_ON_AGENT,
L3_REBALANCE_STATE.GET_ROUTER_PORT_NETWORKS,
L3_REBALANCE_STATE.GET_PHYSICAL_NETWORK_FROM_NETWORKS,
L3_REBALANCE_STATE.GET_HOST_PHYSICAL_NETWORKS,
L3_REBALANCE_STATE.RESCHEDULE_DOWN_AGENT,
L3_REBALANCE_STATE.HOLD_OFF,
L3_REBALANCE_STATE.DONE]
# 9 randomized iterations (range(1, 10)).
for x in range(1, 10):
_L3Rebalance.router_diff_threshold = random.randint(1, 4)
add_rebalance_work_l3('compute-0', True)
# aborted: the second event has been injected in this iteration.
# doing_abort: the injection happened on the current step.
aborted = False
doing_abort = False
# Index into abort_state_list; randint is inclusive on both ends.
abort_state = random.randint(0, len(abort_state_list) - 1)
loopcount = 0
if DEBUG_PRINTING:
print("HOST DOWN TEST NUMBER %s" % str(x))
while True:
loopcount += 1
old_state = _L3Rebalance.get_state()
# Inject the second event at most once per iteration.
if old_state == (abort_state_list[abort_state]) and (not aborted):
aborted = True
doing_abort = True
add_rebalance_work_l3('compute-1', True)
if DEBUG_PRINTING:
# NOTE(review): the message below says "host-up" although this is the
# host-down test -- looks like a copy/paste slip; confirm and fix.
print("host-up adding compute-1 down in state: %s." %
old_state)
_run_state_machine()
new_state = _L3Rebalance.get_state()
# Only the step right after the injection is checked.
if doing_abort:
doing_abort = False
# No next-state expectation when injected in DONE or HOLD_OFF.
if (old_state != L3_REBALANCE_STATE.DONE) and \
(old_state != L3_REBALANCE_STATE.HOLD_OFF):
if _L3Rebalance.num_l3agents < 2:
assert(new_state == L3_REBALANCE_STATE.DONE)
else:
assert(new_state ==
L3_REBALANCE_STATE.GET_ROUTERS_HOSTED_ON_AGENT)
# Snapshot the per-agent router counts on this transition so the
# totals can be compared once the run finishes.
if ((old_state ==
L3_REBALANCE_STATE.GET_ROUTERS_HOSTED_ON_AGENT) and
(new_state ==
L3_REBALANCE_STATE.GET_ROUTER_PORT_NETWORKS)):
for idx in range(len(_L3Rebalance.num_routers_on_agents)):
initial_router_config.append(
_L3Rebalance.num_routers_on_agents[idx])
initial_router_count = \
sum(_L3Rebalance.num_routers_on_agents)
# DONE with no host-down work left queued: verify the outcome.
if (_L3Rebalance.get_state() == L3_REBALANCE_STATE.DONE) and \
(len(_L3Rebalance.host_down_queue) == 0):
final_router_count = \
sum(_L3Rebalance.num_routers_on_agents)
if DEBUG_PRINTING:
print("router_diff_threshold: %s" %
_L3Rebalance.router_diff_threshold)
print("initial_router_count: %s, "
"final_router_count: %s" %
(initial_router_count, final_router_count))
print("initial num_routers_on_agents: %s, "
"final num_routers_on_agents: %s" %
(initial_router_config,
_L3Rebalance.num_routers_on_agents))
# Clear the snapshot in place for the next iteration.
del initial_router_config[:]
# NOTE(review): the guard is "> 2" but the else-branch message below
# says "less than 2 agents", and num_routers_length == 0 can never
# hold under this guard -- confirm whether ">= 2" was intended.
if len(_L3Rebalance.num_routers_on_agents) > 2:
num_routers_length = \
len(_L3Rebalance.num_routers_on_agents)
# The first agent entry must hold no routers.
assert ((num_routers_length == 0) or
_L3Rebalance.num_routers_on_agents[0] == 0)
# No router may be lost or duplicated.
assert (initial_router_count == final_router_count)
else:
if DEBUG_PRINTING:
print("less than 2 agents, nothing to do")
break
# Runaway guard: once the step budget is reached the assert fails.
if loopcount >= MAX_LOOPCOUNT:
print("Loopcount exit!!! loopcount:%s" % loopcount)
assert loopcount < MAX_LOOPCOUNT
@ -335,7 +427,98 @@ class TestNeutronRebalance2(testcase.NFVTestCase):
print("less than 2 agents, nothing to do")
break
if loopcount > MAX_LOOPCOUNT:
if loopcount >= MAX_LOOPCOUNT:
print("Loopcount exit!!! loopcount:%s" % loopcount)
assert loopcount < MAX_LOOPCOUNT
def test_rebalance_up_host_abort_randomized_w_api_calls(self):
# Host-up abort scenario for the L3 rebalance state machine.
# 'compute-0' is queued with the flag False (presumably "host coming up";
# confirm against add_rebalance_work_l3), then 'compute-1' is queued
# with the flag True once the state machine reaches a random state.
initial_router_count = 0
initial_router_config = list()
# States at which the host-down event may be injected.
abort_state_list = [L3_REBALANCE_STATE.GET_NETWORK_AGENTS,
L3_REBALANCE_STATE.GET_ROUTERS_HOSTED_ON_AGENT,
L3_REBALANCE_STATE.GET_ROUTER_PORT_NETWORKS,
L3_REBALANCE_STATE.GET_PHYSICAL_NETWORK_FROM_NETWORKS,
L3_REBALANCE_STATE.GET_HOST_PHYSICAL_NETWORKS,
L3_REBALANCE_STATE.RESCHEDULE_NEW_AGENT,
L3_REBALANCE_STATE.HOLD_OFF,
L3_REBALANCE_STATE.DONE]
# 9 randomized iterations (range(1, 10)).
for x in range(1, 10):
_L3Rebalance.router_diff_threshold = random.randint(1, 4)
add_rebalance_work_l3('compute-0', False)
# aborted: the host-down event has been injected in this iteration.
# doing_abort: the injection happened on the current step.
aborted = False
doing_abort = False
# Index into abort_state_list; randint is inclusive on both ends.
abort_state = random.randint(0, len(abort_state_list) - 1)
loopcount = 0
if DEBUG_PRINTING:
print("HOST UP TEST NUMBER %s" % str(x))
while True:
loopcount += 1
old_state = _L3Rebalance.get_state()
# Inject the host-down event at most once per iteration.
if old_state == (abort_state_list[abort_state]) and (not aborted):
aborted = True
doing_abort = True
add_rebalance_work_l3('compute-1', True)
if DEBUG_PRINTING:
print("host-up adding compute-1 down in state: %s." %
old_state)
_run_state_machine()
new_state = _L3Rebalance.get_state()
# Only the step right after the injection is checked.
if doing_abort:
doing_abort = False
# Injected mid-run: the machine is expected to land in HOLD_OFF.
if (old_state != L3_REBALANCE_STATE.DONE) and \
(old_state != L3_REBALANCE_STATE.HOLD_OFF):
assert(new_state ==
L3_REBALANCE_STATE.HOLD_OFF)
# Snapshot the per-agent router counts on this transition so the
# totals can be compared once the run finishes.
if ((old_state ==
L3_REBALANCE_STATE.GET_ROUTERS_HOSTED_ON_AGENT) and
((new_state ==
L3_REBALANCE_STATE.GET_ROUTER_PORT_NETWORKS) or
(new_state == L3_REBALANCE_STATE.DONE))):
# new_state DONE is for already balanced case
for idx in range(len(_L3Rebalance.num_routers_on_agents)):
initial_router_config.append(
_L3Rebalance.num_routers_on_agents[idx])
initial_router_count = sum(
_L3Rebalance.num_routers_on_agents)
# DONE with both queues drained: verify the outcome.
if ((_L3Rebalance.get_state() == L3_REBALANCE_STATE.DONE) and
(len(_L3Rebalance.host_up_queue) == 0) and
(len(_L3Rebalance.host_down_queue) == 0)):
final_router_count = sum(
_L3Rebalance.num_routers_on_agents)
if DEBUG_PRINTING:
print("router_diff_threshold: %s" %
_L3Rebalance.router_diff_threshold)
print("initial_router_count: %s, "
"final_router_count: %s" %
(initial_router_count, final_router_count))
print("initial num_routers_on_agents: %s, "
"final num_routers_on_agents: %s" %
(initial_router_config,
_L3Rebalance.num_routers_on_agents))
# Clear the snapshot in place for the next iteration.
del initial_router_config[:]
# NOTE(review): guard is "> 2" while the else-branch message below says
# "less than 2 agents" -- confirm whether ">= 2" was intended.
if len(_L3Rebalance.num_routers_on_agents) > 2:
# No router may be lost or duplicated.
assert (initial_router_count == final_router_count)
# Spread between the busiest and idlest agent stays within threshold.
assert (max(_L3Rebalance.num_routers_on_agents) -
min(_L3Rebalance.num_routers_on_agents) <=
_L3Rebalance.router_diff_threshold)
else:
if DEBUG_PRINTING:
print("less than 2 agents, nothing to do")
break
# Runaway guard: once the step budget is reached the assert fails.
if loopcount >= MAX_LOOPCOUNT:
print("Loopcount exit!!! loopcount:%s" % loopcount)
assert loopcount < MAX_LOOPCOUNT

View File

@ -93,6 +93,32 @@ class DHCPAgentRebalance(object):
# queues that maintain host names of hosts coming up and going down.
self.host_up_queue = list()
self.host_down_queue = list()
# whether to abort and restart.
self.abort = False
def set_abort(self):
    """Flag the in-progress DHCP reschedule for abort.

    This only raises the flag; check_abort() is what acts on it and
    clears it again.
    """
    self.abort = True
def check_abort(self):
    """Act on a pending abort request, if there is one.

    A raised abort flag means the status of one of the agents changed
    while a reschedule was running, so the algorithm has to restart.
    Nothing happens when the flag is not set.
    """
    if not self.abort:
        return

    host_down_in_progress = self.get_working_host() is not None
    if host_down_in_progress:
        # A host down was being processed: go back to GET_DHCP_AGENTS
        # and redo the host down processing for the current host.
        self.set_state(DHCP_REBALANCE_STATE.GET_DHCP_AGENTS)
    else:
        # A host up was being processed: park in HOLD_OFF so the host
        # down that just arrived is serviced first ...
        self.set_state(DHCP_REBALANCE_STATE.HOLD_OFF)
        # ... and queue another host up entry so the host up rebalance
        # is triggered again once the host down has been handled.
        self.host_up_queue.append('abort-restart')

    self.abort = False
    DLOG.info("Aborting current reschedule and restarting")
def reinit(self):
self.num_dhcp_agents = 0
@ -284,7 +310,30 @@ class DHCPAgentRebalance(object):
def add_rebalance_work(self, host_name, host_is_going_down):
    """Queue a host for DHCP agent rebalancing.

    :param host_name: name of the host whose status changed.
    :param host_is_going_down: when true the host is queued on
        host_down_queue (subject to the checks below); otherwise it is
        appended to host_up_queue unconditionally.

    A host going down is queued only if it is not already queued and
    is not the host whose host-down reschedule is currently running.
    If some other reschedule is running when the host is queued, the
    abort flag is raised so that networks are not rescheduled onto the
    host that just went down.
    """
    if not host_is_going_down:
        self.host_up_queue.append(host_name)
        return

    # The duplicate check must come before any append: appending first
    # makes the membership test always true, so the abort would never
    # be raised and the host would be reported as a duplicate.
    if host_name in self.host_down_queue:
        DLOG.debug("Not adding duplicate host down queue entry")
        return

    in_progress = (self.state != DHCP_REBALANCE_STATE.DONE) and \
        (self.state != DHCP_REBALANCE_STATE.HOLD_OFF)

    if in_progress and self.get_working_host() == host_name:
        DLOG.debug("Not adding host down entry as host "
                   "down processing for this host already "
                   "in progress")
        return

    if in_progress:
        # A reschedule is running, but not the host-down reschedule for
        # this host. Abort immediately and restart, lest networks get
        # rescheduled onto a down host.
        self.set_abort()

    self.host_down_queue.append(host_name)
@ -818,6 +867,8 @@ def _run_state_machine():
_DHCPRebalance.state_machine_in_progress = True
_DHCPRebalance.check_abort()
my_state = _DHCPRebalance.get_state()
DLOG.debug("Network Rebalance State %s" % my_state)
if my_state == DHCP_REBALANCE_STATE.GET_DHCP_AGENTS:

View File

@ -99,6 +99,32 @@ class L3AgentRebalance(object):
# queues that maintain host names of hosts coming up and going down.
self.host_up_queue = list()
self.host_down_queue = list()
# whether to abort and restart.
self.abort = False
def set_abort(self):
    """Flag the in-progress L3 reschedule for abort.

    This only raises the flag; check_abort() is what acts on it and
    clears it again.
    """
    self.abort = True
def check_abort(self):
    """Act on a pending abort request, if there is one.

    A raised abort flag means the status of one of the agents changed
    while a reschedule was running, so the algorithm has to restart.
    Nothing happens when the flag is not set.
    """
    if not self.abort:
        return

    host_down_in_progress = self.get_working_host() is not None
    if host_down_in_progress:
        # A host down was being processed: go back to
        # GET_NETWORK_AGENTS and redo the host down processing for the
        # current host.
        self.set_state(L3_REBALANCE_STATE.GET_NETWORK_AGENTS)
    else:
        # A host up was being processed: park in HOLD_OFF so the host
        # down that just arrived is serviced first ...
        self.set_state(L3_REBALANCE_STATE.HOLD_OFF)
        # ... and queue another host up entry so the host up rebalance
        # is triggered again once the host down has been handled.
        self.host_up_queue.append('abort-restart')

    self.abort = False
    DLOG.info("Aborting current reschedule and restarting")
def reinit(self):
self.num_l3agents = 0
@ -380,7 +406,30 @@ class L3AgentRebalance(object):
def add_rebalance_work(self, host_name, host_is_going_down):
    """Queue a host for L3 agent rebalancing.

    :param host_name: name of the host whose status changed.
    :param host_is_going_down: when true the host is queued on
        host_down_queue (subject to the checks below); otherwise it is
        appended to host_up_queue unconditionally.

    A host going down is queued only if it is not already queued and
    is not the host whose host-down reschedule is currently running.
    If some other reschedule is running when the host is queued, the
    abort flag is raised so that routers are not rescheduled onto the
    host that just went down.
    """
    if not host_is_going_down:
        self.host_up_queue.append(host_name)
        return

    # The duplicate check must come before any append: appending first
    # makes the membership test always true, so the abort would never
    # be raised and the host would be reported as a duplicate.
    if host_name in self.host_down_queue:
        DLOG.debug("Not adding duplicate host down queue entry")
        return

    in_progress = (self.state != L3_REBALANCE_STATE.DONE) and \
        (self.state != L3_REBALANCE_STATE.HOLD_OFF)

    if in_progress and self.get_working_host() == host_name:
        DLOG.debug("Not adding host down entry as host "
                   "down processing for this host already "
                   "in progress")
        return

    if in_progress:
        # A reschedule is running, but not the host-down reschedule for
        # this host. Abort immediately and restart, lest routers get
        # rescheduled onto a down host.
        self.set_abort()

    self.host_down_queue.append(host_name)
@ -1009,8 +1058,11 @@ def _run_state_machine():
_L3Rebalance.state_machine_in_progress = True
_L3Rebalance.check_abort()
my_state = _L3Rebalance.get_state()
DLOG.debug("Network Rebalance State %s" % my_state)
if my_state == L3_REBALANCE_STATE.GET_NETWORK_AGENTS:
_L3Rebalance.reinit()