Merge "Apply NoExecute taint to locked nodes"

commit 97524fee3a
@@ -893,34 +893,21 @@ class NFVIInfrastructureAPI(nfvi.api.v1.NFVIInfrastructureAPI):
             future.set_timeouts(config.CONF.get('nfvi-timeouts', None))
 
             if self._host_supports_kubernetes(host_personality):
-                if True:
-                    # For now, we do not want to apply the NoExecute taint.
-                    # When the VIM detects that a service is failed on a host,
-                    # it goes through a disable/enable cycle. This would cause
-                    # the NoExecute taint to be applied/removed which causes
-                    # most pods to be stopped/started. If the pods don't come
-                    # back quickly enough the VIM will attempt another
-                    # disable/enable, which can go on forever. For now,
-                    # we will just avoid tainting hosts.
-                    # TODO(bwensley): Rework when support for pure k8s hosts is
-                    # added.
-                    pass
-                else:
-                    response['reason'] = 'failed to disable kubernetes services'
-
-                    # To disable kubernetes we add the NoExecute taint to the
-                    # node. This removes pods that can be scheduled elsewhere
-                    # and prevents new pods from scheduling on the node.
-                    future.work(kubernetes_client.taint_node,
-                                host_name, "NoExecute", "services", "disabled")
-
-                    future.result = (yield)
-
-                    if not future.result.is_complete():
-                        DLOG.error("Kubernetes taint_node failed, operation "
-                                   "did not complete, host_uuid=%s, host_name=%s."
-                                   % (host_uuid, host_name))
-                        return
+                response['reason'] = 'failed to disable kubernetes services'
+
+                # To disable kubernetes we add the NoExecute taint to the
+                # node. This removes pods that can be scheduled elsewhere
+                # and prevents new pods from scheduling on the node.
+                future.work(kubernetes_client.taint_node,
+                            host_name, "NoExecute", "services", "disabled")
+
+                future.result = (yield)
+
+                if not future.result.is_complete():
+                    DLOG.error("Kubernetes taint_node failed, operation "
+                               "did not complete, host_uuid=%s, host_name=%s."
+                               % (host_uuid, host_name))
+                    return
 
             response['completed'] = True
             response['reason'] = ''
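
Note on the taint call above: as the comment says, disabling kubernetes is done by adding a NoExecute taint to the node, which evicts pods that can be scheduled elsewhere and prevents new pods from landing on it. The in-tree kubernetes_client.taint_node wrapper is not part of this diff; a minimal sketch of what such a helper can look like, assuming the official kubernetes Python client and reachable cluster credentials, is:

# Hypothetical sketch only; the real kubernetes_client.taint_node wrapper is
# not shown in this diff and may differ.
from kubernetes import client, config


def taint_node(node_name, effect, key, value):
    config.load_kube_config()  # or config.load_incluster_config() on a node
    core_v1 = client.CoreV1Api()

    node = core_v1.read_node(node_name)
    taints = node.spec.taints or []

    # Only add the taint if an equivalent one is not already present.
    if not any(t.key == key and t.effect == effect for t in taints):
        taints.append(client.V1Taint(key=key, value=value, effect=effect))
        # Send the full list back so existing taints are preserved.
        core_v1.patch_node(node_name, {"spec": {"taints": taints}})


# The plugin above calls it as:
# taint_node(host_name, "NoExecute", "services", "disabled")
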
@@ -669,49 +669,45 @@ def query_network_agents(token, host_name, check_fully_up):
     Input parameter check_fully_up set to True will check for
     both alive and admin_state_up, otherwise only alive is checked.
     """
-    try:
-        url, api_cmd, api_cmd_headers, result_data = get_network_agents(
-            token, host_name)
-
-        agent_state = 'up'
-        supported_agents = [AGENT_TYPE.L3, AGENT_TYPE.DHCP]
-        for supported_agent in supported_agents:
-            found = False
-            for agent in result_data:
-                agent_type = agent.get('agent_type', '')
-                host = agent.get('host', '')
-                if (agent_type == supported_agent) and (host == host_name):
-                    DLOG.verbose("found agent %s for host %s" %
-                                 (supported_agent, host_name))
-                    alive = agent.get('alive', False)
-                    admin_state_up = agent.get('admin_state_up', False)
-                    # found the agent of interest.
-                    found = True
-                    break
-            if found:
-                if check_fully_up:
-                    if not (alive and admin_state_up):
-                        DLOG.verbose("host %s agent %s not fully up. alive: %s,"
-                                     " admin_state_up: %s" %
-                                     (host_name, supported_agent,
-                                      alive, admin_state_up))
-                        agent_state = 'down'
-                        break
-                else:
-                    if not alive:
-                        DLOG.verbose("host %s agent %s not alive" %
-                                     (host_name, supported_agent))
-                        agent_state = 'down'
-                        break
-            else:
-                DLOG.error("host %s agent %s not present" %
-                           (host_name, supported_agent))
-                agent_state = 'down'
-                break
-
-    except Exception as e:
-        DLOG.exception("Caught exception trying to query host %s "
-                       "agent states: %s" % (host_name, e))
-        agent_state = 'down'
+    url, api_cmd, api_cmd_headers, result_data = get_network_agents(
+        token, host_name)
+
+    agent_state = 'up'
+    alive = False
+    admin_state_up = False
+    supported_agents = [AGENT_TYPE.L3, AGENT_TYPE.DHCP]
+    for supported_agent in supported_agents:
+        found = False
+        for agent in result_data:
+            agent_type = agent.get('agent_type', '')
+            host = agent.get('host', '')
+            if (agent_type == supported_agent) and (host == host_name):
+                DLOG.verbose("found agent %s for host %s" %
+                             (supported_agent, host_name))
+                alive = agent.get('alive', False)
+                admin_state_up = agent.get('admin_state_up', False)
+                # found the agent of interest.
+                found = True
+                break
+        if found:
+            if check_fully_up:
+                if not (alive and admin_state_up):
+                    DLOG.verbose("host %s agent %s not fully up. alive: %s,"
+                                 " admin_state_up: %s" %
+                                 (host_name, supported_agent,
+                                  alive, admin_state_up))
+                    agent_state = 'down'
+                    break
+            else:
+                if not alive:
+                    DLOG.verbose("host %s agent %s not alive" %
+                                 (host_name, supported_agent))
+                    agent_state = 'down'
+                    break
+        else:
+            DLOG.error("host %s agent %s not present" %
+                       (host_name, supported_agent))
+            agent_state = 'down'
+            break
 
     return agent_state
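
The new version of query_network_agents drops the blanket try/except and initializes alive and admin_state_up before the agent scan, so both names are always bound when the "if found:" checks run. A self-contained illustration of the per-agent evaluation follows; the agent record is hypothetical, but the field names are the ones the function reads from the Neutron agents API:

# Illustrative only: a hypothetical Neutron agent record with the fields the
# function reads (agent_type, host, alive, admin_state_up).
agent = {
    'agent_type': 'L3 agent',
    'host': 'compute-0',
    'alive': True,
    'admin_state_up': False,
}

check_fully_up = True
alive = agent.get('alive', False)
admin_state_up = agent.get('admin_state_up', False)

if check_fully_up:
    # Both flags must be set for the agent to count as up.
    agent_state = 'up' if (alive and admin_state_up) else 'down'
else:
    # Otherwise only liveness matters.
    agent_state = 'up' if alive else 'down'

print(agent_state)  # -> 'down', because admin_state_up is False
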
@@ -38,6 +38,13 @@ class SwMgmtDirector(object):
         """
         return self._sw_update
 
+    @property
+    def single_controller(self):
+        """
+        Returns whether this is a single controller configuration
+        """
+        return self._single_controller
+
     def create_sw_patch_strategy(self, controller_apply_type, storage_apply_type,
                                  swift_apply_type, worker_apply_type,
                                  max_parallel_worker_hosts,
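
The property added above exposes self._single_controller so the host tasks further down can ask directors.get_sw_mgmt_director().single_controller. How the flag is populated is outside this hunk; a constructor sketch under that assumption (the parameter name is hypothetical) is:

# Hypothetical constructor sketch; the real SwMgmtDirector __init__ is not
# part of this diff, so the wiring shown here is an assumption used only to
# illustrate where _single_controller comes from.
class SwMgmtDirector(object):
    def __init__(self, sw_update, single_controller):
        self._sw_update = sw_update
        self._single_controller = single_controller

    @property
    def single_controller(self):
        """
        Returns whether this is a single controller configuration
        """
        return self._single_controller

# Consumers (see the host task hunks below) read it as a plain attribute:
#     if not sw_mgmt_director.single_controller: ...
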
@@ -234,9 +234,6 @@ class DisableHostTask(state_machine.StateTask):
         if host.host_service_configured(objects.HOST_SERVICES.GUEST):
             task_work_list.append(DisableHostServicesTaskWork(
                 self, host, objects.HOST_SERVICES.GUEST))
-        if host.host_service_configured(objects.HOST_SERVICES.CONTAINER):
-            task_work_list.append(DisableHostServicesTaskWork(
-                self, host, objects.HOST_SERVICES.CONTAINER))
         if host.host_service_configured(objects.HOST_SERVICES.COMPUTE):
             task_work_list.append(QueryHypervisorTaskWork(
                 self, host, force_pass=True))
@@ -248,6 +245,17 @@ class DisableHostTask(state_machine.StateTask):
             task_work_list.append(NotifyHostDisabledTaskWork(
                 self, host, objects.HOST_SERVICES.NETWORK))
         task_work_list.append(NotifyInstancesHostDisabledTaskWork(self, host))
+        if host.host_service_configured(objects.HOST_SERVICES.CONTAINER):
+            # Only disable the container services if the host is being locked
+            # and we are not running in a single controller configuration. In
+            # a single controller configuration we keep the container services
+            # running.
+            if self._host.is_locking():
+                from nfv_vim import directors
+                sw_mgmt_director = directors.get_sw_mgmt_director()
+                if not sw_mgmt_director.single_controller:
+                    task_work_list.append(DisableHostServicesTaskWork(
+                        self, host, objects.HOST_SERVICES.CONTAINER))
         task_work_list.append(notify_host_services_task(
             self, host, force_pass=True))
         if host.host_service_configured(objects.HOST_SERVICES.COMPUTE):
@@ -443,8 +451,21 @@ class NotifyDisabledHostTask(state_machine.StateTask):
     Notify Disabled Host Task
     """
     def __init__(self, host):
+        from nfv_vim import objects
+
         self._host_reference = weakref.ref(host)
         task_work_list = list()
+        if host.host_service_configured(objects.HOST_SERVICES.CONTAINER):
+            # Only disable the container services if the host is being locked
+            # and we are not running in a single controller configuration. In
+            # a single controller configuration we keep the container services
+            # running.
+            if self._host.is_locking():
+                from nfv_vim import directors
+                sw_mgmt_director = directors.get_sw_mgmt_director()
+                if not sw_mgmt_director.single_controller:
+                    task_work_list.append(DisableHostServicesTaskWork(
+                        self, host, objects.HOST_SERVICES.CONTAINER))
         task_work_list.append(NotifyHostServicesDisabledTaskWork(
             self, host, force_pass=True))
         super(NotifyDisabledHostTask, self).__init__(
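
Both DisableHostTask and NotifyDisabledHostTask now apply the same gating before disabling container services. Condensed into a single predicate for readability (the helper below is illustrative only and does not exist in the tree):

# Illustrative condensation of the gating added in both tasks above; this
# helper is hypothetical.
from nfv_vim import objects  # same import the tasks use


def should_disable_container_services(host, sw_mgmt_director):
    # Container services are disabled only when all three hold:
    #   1. the host has the container services configured,
    #   2. the host is being locked, and
    #   3. this is not a single controller configuration (where the
    #      container services are kept running).
    return (host.host_service_configured(objects.HOST_SERVICES.CONTAINER) and
            host.is_locking() and
            not sw_mgmt_director.single_controller)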