Neutron Host Management Robustness
If a network agent goes down in the middle of processing a reschedule/rebalance operation, abort the current operation so as not to schedule resources onto a down agent. This is necessary because the reschedule/rebalance operations may take a fair amount of time, and users may lock hosts (bringing down the network agents) at any time.

Change-Id: Ie933b517e4e4cea3beb7d00e3c65d2f538f14e5c
Story: 2003857
Task: 30500
Signed-off-by: Kevin Smith <kevin.smith@windriver.com>
This commit is contained in:
parent
9b600d7b9e
commit
037727447b
@ -60,7 +60,8 @@ def build_get_agents_response():
|
||||
return get_agents_response
|
||||
|
||||
|
||||
def build_get_dhcp_agent_networks_response(agent_id, use_strange_networks=False):
|
||||
def build_get_dhcp_agent_networks_response(agent_id,
|
||||
use_strange_networks=False):
|
||||
get_dhcp_agent_networks_response = dict()
|
||||
get_dhcp_agent_networks_response['completed'] = True
|
||||
get_dhcp_agent_networks_response['reason'] = ''
|
||||
@ -238,7 +239,188 @@ class TestNeutronDHCPRebalance(testcase.NFVTestCase):
|
||||
print("less than 2 agents, nothing to do")
|
||||
break
|
||||
|
||||
if loopcount > MAX_LOOPCOUNT:
|
||||
if loopcount >= MAX_LOOPCOUNT:
|
||||
print("Loopcount exit!!! loopcount:%s" % loopcount)
|
||||
|
||||
assert loopcount < MAX_LOOPCOUNT
|
||||
|
||||
@mock.patch('nfv_vim.nfvi.nfvi_get_dhcp_agent_networks',
            fake_nfvi_get_dhcp_agent_networks)
def test_rebalance_down_host_abort_w_api_calls(self):
    """Abort a DHCP host-down rebalance by taking a second host down.

    Each iteration queues a host-down rebalance for 'compute-0'. The first
    time the state machine is observed in a randomly chosen state, a
    host-down for 'compute-1' is queued as well. If that happens while the
    machine is neither DONE nor in HOLD_OFF, the next run must land in
    DONE (fewer than two agents) or GET_NETWORKS_HOSTED_ON_AGENT. Once
    the machine is DONE with an empty host-down queue, the total number of
    networks must be unchanged and the first agent must host none.
    """
    initial_network_count = 0
    initial_network_config = list()

    abort_state_list = [DHCP_REBALANCE_STATE.GET_DHCP_AGENTS,
                        DHCP_REBALANCE_STATE.GET_NETWORKS_HOSTED_ON_AGENT,
                        DHCP_REBALANCE_STATE.GET_HOST_PHYSICAL_NETWORKS,
                        DHCP_REBALANCE_STATE.RESCHEDULE_DOWN_AGENT,
                        DHCP_REBALANCE_STATE.HOLD_OFF,
                        DHCP_REBALANCE_STATE.DONE]

    for x in range(1, 200):
        _DHCPRebalance.network_diff_threshold = random.randint(1, 4)
        add_rebalance_work_dhcp('compute-0', True)
        loopcount = 0
        if DEBUG_PRINTING:
            print("HOST DOWN TEST NUMBER %s" % str(x))

        aborted = False
        doing_abort = False
        abort_state = random.randint(0, len(abort_state_list) - 1)
        while True:
            loopcount += 1

            old_state = _DHCPRebalance.get_state()

            # Inject the second host-down exactly once per iteration.
            if old_state == abort_state_list[abort_state] and not aborted:
                aborted = True
                doing_abort = True
                add_rebalance_work_dhcp('compute-1', True)
                if DEBUG_PRINTING:
                    print("host-down adding compute-1 down in state: %s." %
                          old_state)

            _run_state_machine()
            new_state = _DHCPRebalance.get_state()

            if doing_abort:
                doing_abort = False
                # An abort is only expected while the machine is running.
                if (old_state != DHCP_REBALANCE_STATE.DONE) and \
                        (old_state != DHCP_REBALANCE_STATE.HOLD_OFF):
                    if _DHCPRebalance.num_dhcp_agents < 2:
                        assert new_state == DHCP_REBALANCE_STATE.DONE
                    else:
                        assert (
                            new_state ==
                            DHCP_REBALANCE_STATE.GET_NETWORKS_HOSTED_ON_AGENT)

            # Snapshot the distribution once the per-agent counts are known.
            if ((old_state ==
                 DHCP_REBALANCE_STATE.GET_NETWORKS_HOSTED_ON_AGENT) and
                    (new_state ==
                     DHCP_REBALANCE_STATE.GET_HOST_PHYSICAL_NETWORKS)):
                initial_network_config.extend(
                    _DHCPRebalance.num_networks_on_agents)
                initial_network_count = \
                    sum(_DHCPRebalance.num_networks_on_agents)

            if (_DHCPRebalance.get_state() == DHCP_REBALANCE_STATE.DONE) and \
                    not _DHCPRebalance.host_down_queue:
                final_network_count = \
                    sum(_DHCPRebalance.num_networks_on_agents)
                if DEBUG_PRINTING:
                    print("network_diff_threshold: %s" %
                          _DHCPRebalance.network_diff_threshold)
                    print("initial_network_count: %s, "
                          "final_network_count: %s" %
                          (initial_network_count, final_network_count))
                    print("initial num_networks_on_agents: %s, "
                          "final num_networks_on_agents: %s" %
                          (initial_network_config,
                           _DHCPRebalance.num_networks_on_agents))
                # NOTE(review): nesting reconstructed from a flattened diff
                # view; the reset is assumed to be unconditional so the
                # snapshot does not accumulate across iterations -- confirm.
                del initial_network_config[:]
                if len(_DHCPRebalance.num_networks_on_agents) > 2:
                    # The list is non-empty here, so indexing [0] is safe.
                    assert _DHCPRebalance.num_networks_on_agents[0] == 0
                    assert initial_network_count == final_network_count
                else:
                    if DEBUG_PRINTING:
                        print("less than 2 agents, nothing to do")
                break

            if loopcount >= MAX_LOOPCOUNT:
                print("Loopcount exit!!! loopcount:%s" % loopcount)

            # NOTE(review): assumed to sit inside the while loop; it is the
            # only statement that stops a run that never reaches DONE.
            assert loopcount < MAX_LOOPCOUNT
|
||||
|
||||
@mock.patch('nfv_vim.nfvi.nfvi_get_dhcp_agent_networks',
            fake_nfvi_get_dhcp_agent_networks)
def test_rebalance_up_host_abort_randomized_w_api_calls(self):
    """Abort a DHCP host-up rebalance by taking a host down mid-run.

    Each iteration queues a host-up rebalance for 'compute-0'. The first
    time the state machine is observed in a randomly chosen state, a
    host-down for 'compute-1' is queued. If that happens while the machine
    is neither DONE nor in HOLD_OFF, the next run must land in HOLD_OFF.
    Once the machine is DONE with both queues empty, the total number of
    networks must be unchanged and the spread between the most and least
    loaded agents must not exceed network_diff_threshold.
    """
    initial_network_count = 0
    initial_network_config = list()

    abort_state_list = [DHCP_REBALANCE_STATE.GET_DHCP_AGENTS,
                        DHCP_REBALANCE_STATE.GET_NETWORKS_HOSTED_ON_AGENT,
                        DHCP_REBALANCE_STATE.GET_HOST_PHYSICAL_NETWORKS,
                        DHCP_REBALANCE_STATE.RESCHEDULE_NEW_AGENT,
                        DHCP_REBALANCE_STATE.HOLD_OFF,
                        DHCP_REBALANCE_STATE.DONE]

    for x in range(1, 200):
        _DHCPRebalance.network_diff_threshold = random.randint(1, 4)
        add_rebalance_work_dhcp('compute-0', False)

        aborted = False
        doing_abort = False
        abort_state = random.randint(0, len(abort_state_list) - 1)

        loopcount = 0
        if DEBUG_PRINTING:
            print("HOST UP TEST NUMBER %s" % str(x))

        while True:
            loopcount += 1

            old_state = _DHCPRebalance.get_state()

            # Inject the host-down exactly once per iteration.
            if old_state == abort_state_list[abort_state] and not aborted:
                aborted = True
                doing_abort = True
                add_rebalance_work_dhcp('compute-1', True)
                if DEBUG_PRINTING:
                    print("host-up adding compute-1 down in state: %s." %
                          old_state)

            _run_state_machine()
            new_state = _DHCPRebalance.get_state()

            if doing_abort:
                doing_abort = False
                # An abort is only expected while the machine is running.
                if (old_state != DHCP_REBALANCE_STATE.DONE) and \
                        (old_state != DHCP_REBALANCE_STATE.HOLD_OFF):
                    assert new_state == DHCP_REBALANCE_STATE.HOLD_OFF

            if ((old_state ==
                 DHCP_REBALANCE_STATE.GET_NETWORKS_HOSTED_ON_AGENT) and
                    ((new_state ==
                      DHCP_REBALANCE_STATE.GET_HOST_PHYSICAL_NETWORKS) or
                     (new_state == DHCP_REBALANCE_STATE.DONE))):
                # new_state DONE is for already balanced case
                initial_network_config.extend(
                    _DHCPRebalance.num_networks_on_agents)
                initial_network_count = sum(
                    _DHCPRebalance.num_networks_on_agents)

            if ((_DHCPRebalance.get_state() == DHCP_REBALANCE_STATE.DONE) and
                    not _DHCPRebalance.host_up_queue and
                    not _DHCPRebalance.host_down_queue):
                final_network_count = sum(
                    _DHCPRebalance.num_networks_on_agents)
                if DEBUG_PRINTING:
                    print("network_diff_threshold: %s" %
                          _DHCPRebalance.network_diff_threshold)
                    print("initial_network_count: %s, "
                          "final_network_count: %s" %
                          (initial_network_count, final_network_count))
                    print("initial num_networks_on_agents: %s, "
                          "final num_networks_on_agents: %s" %
                          (initial_network_config,
                           _DHCPRebalance.num_networks_on_agents))
                # NOTE(review): nesting reconstructed from a flattened diff
                # view; the reset is assumed to be unconditional so the
                # snapshot does not accumulate across iterations -- confirm.
                del initial_network_config[:]
                if len(_DHCPRebalance.num_networks_on_agents) > 2:
                    assert initial_network_count == final_network_count
                    assert (max(_DHCPRebalance.num_networks_on_agents) -
                            min(_DHCPRebalance.num_networks_on_agents) <=
                            _DHCPRebalance.network_diff_threshold)
                else:
                    if DEBUG_PRINTING:
                        print("less than 2 agents, nothing to do")
                break

            if loopcount >= MAX_LOOPCOUNT:
                print("Loopcount exit!!! loopcount:%s" % loopcount)

            # NOTE(review): assumed to sit inside the while loop; it is the
            # only statement that stops a run that never reaches DONE.
            assert loopcount < MAX_LOOPCOUNT
|
||||
@ -298,7 +480,7 @@ class TestNeutronDHCPRebalance(testcase.NFVTestCase):
|
||||
print("less than 2 agents, nothing to do")
|
||||
break
|
||||
|
||||
if loopcount > MAX_LOOPCOUNT:
|
||||
if loopcount >= MAX_LOOPCOUNT:
|
||||
print("Loopcount exit!!! loopcount:%s" % loopcount)
|
||||
|
||||
assert loopcount < MAX_LOOPCOUNT
|
||||
@ -355,7 +537,7 @@ class TestNeutronDHCPRebalance(testcase.NFVTestCase):
|
||||
print("less than 2 agents, nothing to do")
|
||||
break
|
||||
|
||||
if loopcount > MAX_LOOPCOUNT:
|
||||
if loopcount >= MAX_LOOPCOUNT:
|
||||
print("Loopcount exit!!! loopcount:%s" % loopcount)
|
||||
|
||||
assert loopcount < MAX_LOOPCOUNT
|
||||
|
@ -277,7 +277,99 @@ class TestNeutronRebalance2(testcase.NFVTestCase):
|
||||
print("less than 2 agents, nothing to do")
|
||||
break
|
||||
|
||||
if loopcount > MAX_LOOPCOUNT:
|
||||
if loopcount >= MAX_LOOPCOUNT:
|
||||
print("Loopcount exit!!! loopcount:%s" % loopcount)
|
||||
|
||||
assert loopcount < MAX_LOOPCOUNT
|
||||
|
||||
def test_rebalance_down_host_abort_randomized_w_api_calls(self):
    """Abort an L3 host-down rebalance by taking a second host down.

    Each iteration queues a host-down rebalance for 'compute-0'. The first
    time the state machine is observed in a randomly chosen state, a
    host-down for 'compute-1' is queued as well. If that happens while the
    machine is neither DONE nor in HOLD_OFF, the next run must land in
    DONE (fewer than two agents) or GET_ROUTERS_HOSTED_ON_AGENT. Once the
    machine is DONE with an empty host-down queue, the total number of
    routers must be unchanged and the first agent must host none.
    """
    initial_router_count = 0
    initial_router_config = list()

    abort_state_list = [L3_REBALANCE_STATE.GET_NETWORK_AGENTS,
                        L3_REBALANCE_STATE.GET_ROUTERS_HOSTED_ON_AGENT,
                        L3_REBALANCE_STATE.GET_ROUTER_PORT_NETWORKS,
                        L3_REBALANCE_STATE.GET_PHYSICAL_NETWORK_FROM_NETWORKS,
                        L3_REBALANCE_STATE.GET_HOST_PHYSICAL_NETWORKS,
                        L3_REBALANCE_STATE.RESCHEDULE_DOWN_AGENT,
                        L3_REBALANCE_STATE.HOLD_OFF,
                        L3_REBALANCE_STATE.DONE]

    for x in range(1, 10):
        _L3Rebalance.router_diff_threshold = random.randint(1, 4)
        add_rebalance_work_l3('compute-0', True)

        aborted = False
        doing_abort = False
        abort_state = random.randint(0, len(abort_state_list) - 1)

        loopcount = 0
        if DEBUG_PRINTING:
            print("HOST DOWN TEST NUMBER %s" % str(x))

        while True:
            loopcount += 1

            old_state = _L3Rebalance.get_state()

            # Inject the second host-down exactly once per iteration.
            if old_state == abort_state_list[abort_state] and not aborted:
                aborted = True
                doing_abort = True
                add_rebalance_work_l3('compute-1', True)
                if DEBUG_PRINTING:
                    # Label matches the scenario under test (host down).
                    print("host-down adding compute-1 down in state: %s." %
                          old_state)

            _run_state_machine()
            new_state = _L3Rebalance.get_state()

            if doing_abort:
                doing_abort = False
                # An abort is only expected while the machine is running.
                if (old_state != L3_REBALANCE_STATE.DONE) and \
                        (old_state != L3_REBALANCE_STATE.HOLD_OFF):
                    if _L3Rebalance.num_l3agents < 2:
                        assert new_state == L3_REBALANCE_STATE.DONE
                    else:
                        assert (
                            new_state ==
                            L3_REBALANCE_STATE.GET_ROUTERS_HOSTED_ON_AGENT)

            # Snapshot the distribution once the per-agent counts are known.
            if ((old_state ==
                 L3_REBALANCE_STATE.GET_ROUTERS_HOSTED_ON_AGENT) and
                    (new_state ==
                     L3_REBALANCE_STATE.GET_ROUTER_PORT_NETWORKS)):
                initial_router_config.extend(
                    _L3Rebalance.num_routers_on_agents)
                initial_router_count = \
                    sum(_L3Rebalance.num_routers_on_agents)

            if (_L3Rebalance.get_state() == L3_REBALANCE_STATE.DONE) and \
                    not _L3Rebalance.host_down_queue:
                final_router_count = \
                    sum(_L3Rebalance.num_routers_on_agents)
                if DEBUG_PRINTING:
                    print("router_diff_threshold: %s" %
                          _L3Rebalance.router_diff_threshold)
                    print("initial_router_count: %s, "
                          "final_router_count: %s" %
                          (initial_router_count, final_router_count))
                    print("initial num_routers_on_agents: %s, "
                          "final num_routers_on_agents: %s" %
                          (initial_router_config,
                           _L3Rebalance.num_routers_on_agents))
                # NOTE(review): nesting reconstructed from a flattened diff
                # view; the reset is assumed to be unconditional so the
                # snapshot does not accumulate across iterations -- confirm.
                del initial_router_config[:]
                if len(_L3Rebalance.num_routers_on_agents) > 2:
                    # The list is non-empty here, so indexing [0] is safe.
                    assert _L3Rebalance.num_routers_on_agents[0] == 0
                    assert initial_router_count == final_router_count
                else:
                    if DEBUG_PRINTING:
                        print("less than 2 agents, nothing to do")
                break

            if loopcount >= MAX_LOOPCOUNT:
                print("Loopcount exit!!! loopcount:%s" % loopcount)

            # NOTE(review): assumed to sit inside the while loop; it is the
            # only statement that stops a run that never reaches DONE.
            assert loopcount < MAX_LOOPCOUNT
|
||||
@ -335,7 +427,98 @@ class TestNeutronRebalance2(testcase.NFVTestCase):
|
||||
print("less than 2 agents, nothing to do")
|
||||
break
|
||||
|
||||
if loopcount > MAX_LOOPCOUNT:
|
||||
if loopcount >= MAX_LOOPCOUNT:
|
||||
print("Loopcount exit!!! loopcount:%s" % loopcount)
|
||||
|
||||
assert loopcount < MAX_LOOPCOUNT
|
||||
|
||||
def test_rebalance_up_host_abort_randomized_w_api_calls(self):
    """Abort an L3 host-up rebalance by taking a host down mid-run.

    Each iteration queues a host-up rebalance for 'compute-0'. The first
    time the state machine is observed in a randomly chosen state, a
    host-down for 'compute-1' is queued. If that happens while the machine
    is neither DONE nor in HOLD_OFF, the next run must land in HOLD_OFF.
    Once the machine is DONE with both queues empty, the total number of
    routers must be unchanged and the spread between the most and least
    loaded agents must not exceed router_diff_threshold.
    """
    initial_router_count = 0
    initial_router_config = list()

    abort_state_list = [L3_REBALANCE_STATE.GET_NETWORK_AGENTS,
                        L3_REBALANCE_STATE.GET_ROUTERS_HOSTED_ON_AGENT,
                        L3_REBALANCE_STATE.GET_ROUTER_PORT_NETWORKS,
                        L3_REBALANCE_STATE.GET_PHYSICAL_NETWORK_FROM_NETWORKS,
                        L3_REBALANCE_STATE.GET_HOST_PHYSICAL_NETWORKS,
                        L3_REBALANCE_STATE.RESCHEDULE_NEW_AGENT,
                        L3_REBALANCE_STATE.HOLD_OFF,
                        L3_REBALANCE_STATE.DONE]

    for x in range(1, 10):
        _L3Rebalance.router_diff_threshold = random.randint(1, 4)
        add_rebalance_work_l3('compute-0', False)

        aborted = False
        doing_abort = False
        abort_state = random.randint(0, len(abort_state_list) - 1)

        loopcount = 0
        if DEBUG_PRINTING:
            print("HOST UP TEST NUMBER %s" % str(x))

        while True:
            loopcount += 1

            old_state = _L3Rebalance.get_state()

            # Inject the host-down exactly once per iteration.
            if old_state == abort_state_list[abort_state] and not aborted:
                aborted = True
                doing_abort = True
                add_rebalance_work_l3('compute-1', True)
                if DEBUG_PRINTING:
                    print("host-up adding compute-1 down in state: %s." %
                          old_state)

            _run_state_machine()
            new_state = _L3Rebalance.get_state()

            if doing_abort:
                doing_abort = False
                # An abort is only expected while the machine is running.
                if (old_state != L3_REBALANCE_STATE.DONE) and \
                        (old_state != L3_REBALANCE_STATE.HOLD_OFF):
                    assert new_state == L3_REBALANCE_STATE.HOLD_OFF

            if ((old_state ==
                 L3_REBALANCE_STATE.GET_ROUTERS_HOSTED_ON_AGENT) and
                    ((new_state ==
                      L3_REBALANCE_STATE.GET_ROUTER_PORT_NETWORKS) or
                     (new_state == L3_REBALANCE_STATE.DONE))):
                # new_state DONE is for already balanced case
                initial_router_config.extend(
                    _L3Rebalance.num_routers_on_agents)
                initial_router_count = sum(
                    _L3Rebalance.num_routers_on_agents)

            if ((_L3Rebalance.get_state() == L3_REBALANCE_STATE.DONE) and
                    not _L3Rebalance.host_up_queue and
                    not _L3Rebalance.host_down_queue):
                final_router_count = sum(
                    _L3Rebalance.num_routers_on_agents)
                if DEBUG_PRINTING:
                    print("router_diff_threshold: %s" %
                          _L3Rebalance.router_diff_threshold)
                    print("initial_router_count: %s, "
                          "final_router_count: %s" %
                          (initial_router_count, final_router_count))
                    print("initial num_routers_on_agents: %s, "
                          "final num_routers_on_agents: %s" %
                          (initial_router_config,
                           _L3Rebalance.num_routers_on_agents))
                # NOTE(review): nesting reconstructed from a flattened diff
                # view; the reset is assumed to be unconditional so the
                # snapshot does not accumulate across iterations -- confirm.
                del initial_router_config[:]
                if len(_L3Rebalance.num_routers_on_agents) > 2:
                    assert initial_router_count == final_router_count
                    assert (max(_L3Rebalance.num_routers_on_agents) -
                            min(_L3Rebalance.num_routers_on_agents) <=
                            _L3Rebalance.router_diff_threshold)
                else:
                    if DEBUG_PRINTING:
                        print("less than 2 agents, nothing to do")
                break

            if loopcount >= MAX_LOOPCOUNT:
                print("Loopcount exit!!! loopcount:%s" % loopcount)

            # NOTE(review): assumed to sit inside the while loop; it is the
            # only statement that stops a run that never reaches DONE.
            assert loopcount < MAX_LOOPCOUNT
|
||||
|
@ -93,6 +93,32 @@ class DHCPAgentRebalance(object):
|
||||
# queues that maintain host names of hosts coming up and going down.
|
||||
self.host_up_queue = list()
|
||||
self.host_down_queue = list()
|
||||
# whether to abort and restart.
|
||||
self.abort = False
|
||||
|
||||
def set_abort(self):
    """Request that the in-progress rebalance be aborted.

    Only raises the flag; check_abort() acts on it and clears it.
    """
    self.abort = True
|
||||
|
||||
def check_abort(self):
    """Act on a pending abort request, if any.

    A set abort flag means the status of one of the agents changed while
    a reschedule was in progress, so the algorithm has to be restarted.
    Does nothing when the flag is clear; otherwise redirects the state
    machine, clears the flag and logs the abort.
    """
    if not self.abort:
        return

    if self.get_working_host() is not None:
        # We were processing a host down. Go to GET_DHCP_AGENTS and
        # restart the host down processing for the current host.
        self.set_state(DHCP_REBALANCE_STATE.GET_DHCP_AGENTS)
    else:
        # We were processing a host up. Go to HOLD_OFF so we can service
        # the host down that just came in first.
        self.set_state(DHCP_REBALANCE_STATE.HOLD_OFF)
        # Enqueue another host up rebalance to trigger host up
        # rebalancing after processing the host down.
        self.host_up_queue.append('abort-restart')

    self.abort = False
    DLOG.info("Aborting current reschedule and restarting")
|
||||
|
||||
def reinit(self):
|
||||
self.num_dhcp_agents = 0
|
||||
@ -284,7 +310,30 @@ class DHCPAgentRebalance(object):
|
||||
|
||||
def add_rebalance_work(self, host_name, host_is_going_down):
    """Queue a host for DHCP agent rebalancing.

    :param host_name: name of the host whose agent changed status.
    :param host_is_going_down: True to queue a host-down reschedule,
        False to queue a host-up rebalance.

    A host-down entry is only added if the host is not already queued
    and a host-down reschedule for that same host is not already being
    processed. If the state machine is busy with anything else, an abort
    is requested first, lest networks get rescheduled onto a down host.
    """
    if not host_is_going_down:
        self.host_up_queue.append(host_name)
        return

    if host_name in self.host_down_queue:
        DLOG.debug("Not adding duplicate host down queue entry")
        return

    in_progress = ((self.state != DHCP_REBALANCE_STATE.DONE) and
                   (self.state != DHCP_REBALANCE_STATE.HOLD_OFF))
    if in_progress:
        if self.get_working_host() != host_name:
            # A reschedule is running, but not the host-down reschedule
            # for this host: abort immediately and restart.
            self.set_abort()
        else:
            DLOG.debug("Not adding host down entry as host "
                       "down processing for this host already "
                       "in progress")
            return

    self.host_down_queue.append(host_name)
|
||||
|
||||
@ -818,6 +867,8 @@ def _run_state_machine():
|
||||
|
||||
_DHCPRebalance.state_machine_in_progress = True
|
||||
|
||||
_DHCPRebalance.check_abort()
|
||||
|
||||
my_state = _DHCPRebalance.get_state()
|
||||
DLOG.debug("Network Rebalance State %s" % my_state)
|
||||
if my_state == DHCP_REBALANCE_STATE.GET_DHCP_AGENTS:
|
||||
|
@ -99,6 +99,32 @@ class L3AgentRebalance(object):
|
||||
# queues that maintain host names of hosts coming up and going down.
|
||||
self.host_up_queue = list()
|
||||
self.host_down_queue = list()
|
||||
# whether to abort and restart.
|
||||
self.abort = False
|
||||
|
||||
def set_abort(self):
    """Request that the in-progress rebalance be aborted.

    Only raises the flag; check_abort() acts on it and clears it.
    """
    self.abort = True
|
||||
|
||||
def check_abort(self):
    """Act on a pending abort request, if any.

    A set abort flag means the status of one of the agents changed while
    a reschedule was in progress, so the algorithm has to be restarted.
    Does nothing when the flag is clear; otherwise redirects the state
    machine, clears the flag and logs the abort.
    """
    if not self.abort:
        return

    if self.get_working_host() is not None:
        # We were processing a host down. Go to GET_NETWORK_AGENTS and
        # restart the host down processing for the current host.
        self.set_state(L3_REBALANCE_STATE.GET_NETWORK_AGENTS)
    else:
        # We were processing a host up. Go to HOLD_OFF so we can service
        # the host down that just came in first.
        self.set_state(L3_REBALANCE_STATE.HOLD_OFF)
        # Enqueue another host up rebalance to trigger host up
        # rebalancing after processing the host down.
        self.host_up_queue.append('abort-restart')

    self.abort = False
    DLOG.info("Aborting current reschedule and restarting")
|
||||
|
||||
def reinit(self):
|
||||
self.num_l3agents = 0
|
||||
@ -380,7 +406,30 @@ class L3AgentRebalance(object):
|
||||
|
||||
def add_rebalance_work(self, host_name, host_is_going_down):
    """Queue a host for L3 agent rebalancing.

    :param host_name: name of the host whose agent changed status.
    :param host_is_going_down: True to queue a host-down reschedule,
        False to queue a host-up rebalance.

    A host-down entry is only added if the host is not already queued
    and a host-down reschedule for that same host is not already being
    processed. If the state machine is busy with anything else, an abort
    is requested first, lest routers get rescheduled onto a down host.
    """
    if not host_is_going_down:
        self.host_up_queue.append(host_name)
        return

    if host_name in self.host_down_queue:
        DLOG.debug("Not adding duplicate host down queue entry")
        return

    in_progress = ((self.state != L3_REBALANCE_STATE.DONE) and
                   (self.state != L3_REBALANCE_STATE.HOLD_OFF))
    if in_progress:
        if self.get_working_host() != host_name:
            # A reschedule is running, but not the host-down reschedule
            # for this host: abort immediately and restart.
            self.set_abort()
        else:
            DLOG.debug("Not adding host down entry as host "
                       "down processing for this host already "
                       "in progress")
            return

    self.host_down_queue.append(host_name)
|
||||
|
||||
@ -1009,8 +1058,11 @@ def _run_state_machine():
|
||||
|
||||
_L3Rebalance.state_machine_in_progress = True
|
||||
|
||||
_L3Rebalance.check_abort()
|
||||
|
||||
my_state = _L3Rebalance.get_state()
|
||||
DLOG.debug("Network Rebalance State %s" % my_state)
|
||||
|
||||
if my_state == L3_REBALANCE_STATE.GET_NETWORK_AGENTS:
|
||||
|
||||
_L3Rebalance.reinit()
|
||||
|
Loading…
Reference in New Issue
Block a user