Merge "Alarm 900.701 raised on failing to remove node taint."
This commit is contained in:
commit
d619a37a6a
@ -4,7 +4,10 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
|
||||
from fm_api import constants as fm_constants
|
||||
from fm_api import fm_api
|
||||
import kubernetes
|
||||
|
||||
from kubernetes import __version__ as K8S_MODULE_VERSION
|
||||
from kubernetes.client.models.v1_container_image import V1ContainerImage
|
||||
from kubernetes.client.rest import ApiException
|
||||
@ -15,6 +18,8 @@ from nfv_common.helpers import Result
|
||||
|
||||
K8S_MODULE_MAJOR_VERSION = int(K8S_MODULE_VERSION.split('.', maxsplit=1)[0])
|
||||
|
||||
fmapi = fm_api.FaultAPIs()
|
||||
|
||||
DLOG = debug.debug_get_logger('nfv_plugins.nfvi_plugins.clients.kubernetes_client')
|
||||
|
||||
|
||||
@ -77,13 +82,42 @@ def get_customobjects_api_instance():
|
||||
return client.CustomObjectsApi()
|
||||
|
||||
|
||||
def raise_alarm(node_name):
    """
    Raise the node tainted alarm against a host

    Builds a fault for FM_ALARM_ID_USM_NODE_TAINTED whose entity instance
    is "<host entity type>=<node_name>" and hands it to the fault
    management API.  The repair action text embeds the node name in the
    suggested kubectl / system commands.
    """
    host_entity_type = fm_constants.FM_ENTITY_TYPE_HOST
    host_instance = "%s=%s" % (host_entity_type, node_name)

    # The three literals are joined into one template before the
    # substitution, so every %s receives the node name.
    repair_action = (
        "Execute 'kubectl taint nodes %s services=disabled:NoExecute-'. "
        "If it fails, Execute 'system host-lock %s' followed by 'system host-unlock %s'. "
        "If issue still persists, contact next level of support."
        % (node_name, node_name, node_name))

    fault_fields = dict(
        alarm_id=fm_constants.FM_ALARM_ID_USM_NODE_TAINTED,
        alarm_state=fm_constants.FM_ALARM_STATE_SET,
        entity_type_id=host_entity_type,
        entity_instance_id=host_instance,
        severity=fm_constants.FM_ALARM_SEVERITY_MAJOR,
        reason_text="Node tainted.",
        alarm_type=fm_constants.FM_ALARM_TYPE_7,
        probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_8,
        proposed_repair_action=repair_action,
        service_affecting=True,
    )
    node_tainted_fault = fm_api.Fault(**fault_fields)

    DLOG.info("Raising alarm %s on %s " % (fm_constants.FM_ALARM_ID_USM_NODE_TAINTED, node_name))
    fmapi.set_fault(node_tainted_fault)
|
||||
|
||||
|
||||
def clear_alarm(node_name):
    """
    Clear the node tainted alarm for a host

    The alarm is addressed by FM_ALARM_ID_USM_NODE_TAINTED together with
    the entity instance "<host entity type>=<node_name>", the same
    identifier format used when the alarm is raised.
    """
    host_instance = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST, node_name)
    tainted_alarm_id = fm_constants.FM_ALARM_ID_USM_NODE_TAINTED

    DLOG.info("Clearing alarm %s on %s " % (tainted_alarm_id, node_name))
    fmapi.clear_fault(tainted_alarm_id, host_instance)
|
||||
|
||||
|
||||
def taint_node(node_name, effect, key, value):
|
||||
"""
|
||||
Apply a taint to a node
|
||||
"""
|
||||
# Get the client.
|
||||
kube_client = get_client()
|
||||
|
||||
# Retrieve the node to access any existing taints.
|
||||
try:
|
||||
response = kube_client.read_node(node_name)
|
||||
@ -127,6 +161,10 @@ def taint_node(node_name, effect, key, value):
|
||||
new_taint = {"key": key, "value": value, "effect": effect}
|
||||
body["spec"]["taints"].append(new_taint)
|
||||
response = kube_client.patch_node(node_name, body)
|
||||
# Clear taint node alarm if tainting is successful.
|
||||
# Alarm not cleared if taint is already present in the system
|
||||
# or the node is under configuration.
|
||||
clear_alarm(node_name)
|
||||
|
||||
return Result(response)
|
||||
|
||||
@ -156,8 +194,27 @@ def untaint_node(node_name, effect, key):
|
||||
# Preserve any existing taints
|
||||
updated_taints = [taint for taint in taints if taint.key != key or
|
||||
taint.effect != effect]
|
||||
DLOG.info("Updated taints %s" % (updated_taints))
|
||||
body = {"spec": {"taints": updated_taints}}
|
||||
response = kube_client.patch_node(node_name, body)
|
||||
check_taints = kube_client.read_node(node_name)
|
||||
taints = check_taints.spec.taints
|
||||
DLOG.info("Existing taint %s" % (taints))
|
||||
if taints is not None:
|
||||
for taint in taints:
|
||||
if (taint.key == key and taint.effect == effect):
|
||||
DLOG.info("Removing %s:%s taint from node %s failed" % (key,
|
||||
effect, node_name))
|
||||
raise_alarm(node_name)
|
||||
break
|
||||
else:
|
||||
# Taint removed successfully. If there are multiple taints
|
||||
# on the system, removing the 'services' taint will clear the alarm.
|
||||
clear_alarm(node_name)
|
||||
else:
|
||||
# If there is only the 'services' taint on the system, then removing the taint
|
||||
# should clear the alarm.
|
||||
clear_alarm(node_name)
|
||||
|
||||
return Result(response)
|
||||
|
||||
|
@ -3,16 +3,19 @@
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
import sys
|
||||
import uuid
|
||||
|
||||
from nfv_plugins.nfvi_plugins.nfvi_infrastructure_api import host_state
|
||||
from nfv_plugins.nfvi_plugins.nfvi_infrastructure_api import NFVIInfrastructureAPI
|
||||
from nfv_vim.nfvi.objects.v1 import HOST_AVAIL_STATUS
|
||||
from nfv_vim.nfvi.objects.v1 import HOST_LABEL_KEYS
|
||||
from nfv_vim.nfvi.objects.v1 import HOST_LABEL_VALUES
|
||||
from nfv_vim.nfvi.objects.v1 import HOST_OPER_STATE
|
||||
|
||||
from nfv_unit_tests.tests import testcase
|
||||
from unittest import mock
|
||||
sys.modules['fm_core'] = mock.Mock()
|
||||
from nfv_plugins.nfvi_plugins.nfvi_infrastructure_api import host_state # noqa: H306,E402 pylint: disable=C0413
|
||||
from nfv_plugins.nfvi_plugins.nfvi_infrastructure_api import NFVIInfrastructureAPI # noqa: H306,E402 pylint: disable=C0413
|
||||
|
||||
# todo(abailey): use already existing constants
|
||||
CONTROLLER_PERSONALITY = 'controller'
|
||||
|
@ -8,11 +8,12 @@
|
||||
|
||||
import kubernetes
|
||||
from kubernetes.client.rest import ApiException
|
||||
from unittest import mock
|
||||
|
||||
from nfv_plugins.nfvi_plugins.clients import kubernetes_client
|
||||
|
||||
from nfv_unit_tests.tests import testcase
|
||||
import sys
|
||||
from unittest import mock
|
||||
sys.modules['fm_core'] = mock.Mock()
|
||||
from nfv_plugins.nfvi_plugins.clients import kubernetes_client # noqa: H306,E402 pylint: disable=C0413
|
||||
|
||||
|
||||
def mock_load_kube_config(path):
|
||||
|
@ -2701,7 +2701,8 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
|
||||
'200.001',
|
||||
'700.004',
|
||||
'280.002',
|
||||
'100.119'],
|
||||
'100.119',
|
||||
'900.701'],
|
||||
'timeout': 1800}
|
||||
]
|
||||
},
|
||||
@ -2723,7 +2724,8 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
|
||||
'200.001',
|
||||
'700.004',
|
||||
'280.002',
|
||||
'100.119'],
|
||||
'100.119',
|
||||
'900.701'],
|
||||
'timeout': 1800}
|
||||
]
|
||||
}
|
||||
@ -2827,7 +2829,8 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
|
||||
'200.001',
|
||||
'700.004',
|
||||
'280.002',
|
||||
'100.119'],
|
||||
'100.119',
|
||||
'900.701'],
|
||||
'timeout': 1800}
|
||||
]
|
||||
},
|
||||
@ -2850,7 +2853,8 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
|
||||
'200.001',
|
||||
'700.004',
|
||||
'280.002',
|
||||
'100.119'],
|
||||
'100.119',
|
||||
'900.701'],
|
||||
'timeout': 1800}
|
||||
]
|
||||
},
|
||||
@ -2873,7 +2877,8 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
|
||||
'200.001',
|
||||
'700.004',
|
||||
'280.002',
|
||||
'100.119'],
|
||||
'100.119',
|
||||
'900.701'],
|
||||
'timeout': 1800}
|
||||
]
|
||||
},
|
||||
@ -2896,7 +2901,8 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
|
||||
'200.001',
|
||||
'700.004',
|
||||
'280.002',
|
||||
'100.119'],
|
||||
'100.119',
|
||||
'900.701'],
|
||||
'timeout': 1800}
|
||||
]
|
||||
},
|
||||
|
@ -1022,7 +1022,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
|
||||
'entity_names': ['storage-0']},
|
||||
_unlock_hosts_stage_as_dict(['storage-0']),
|
||||
{'name': 'wait-data-sync',
|
||||
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'],
|
||||
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'],
|
||||
'timeout': 7200}
|
||||
]
|
||||
},
|
||||
@ -1036,7 +1036,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
|
||||
'entity_names': ['storage-1']},
|
||||
_unlock_hosts_stage_as_dict(['storage-1']),
|
||||
{'name': 'wait-data-sync',
|
||||
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'],
|
||||
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'],
|
||||
'timeout': 7200}
|
||||
]
|
||||
},
|
||||
@ -1050,7 +1050,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
|
||||
'entity_names': ['storage-2']},
|
||||
_unlock_hosts_stage_as_dict(['storage-2']),
|
||||
{'name': 'wait-data-sync',
|
||||
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'],
|
||||
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'],
|
||||
'timeout': 7200}
|
||||
]
|
||||
},
|
||||
@ -1064,7 +1064,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
|
||||
'entity_names': ['storage-3']},
|
||||
_unlock_hosts_stage_as_dict(['storage-3']),
|
||||
{'name': 'wait-data-sync',
|
||||
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'],
|
||||
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'],
|
||||
'timeout': 7200}
|
||||
]
|
||||
},
|
||||
@ -1112,7 +1112,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
|
||||
'entity_names': ['controller-0']},
|
||||
_unlock_hosts_stage_as_dict(['controller-0']),
|
||||
{'name': 'wait-data-sync',
|
||||
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'],
|
||||
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'],
|
||||
'timeout': 14400}
|
||||
]
|
||||
}
|
||||
@ -1159,7 +1159,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
|
||||
'entity_names': ['controller-1']},
|
||||
_unlock_hosts_stage_as_dict(['controller-1']),
|
||||
{'name': 'wait-data-sync',
|
||||
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'],
|
||||
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'],
|
||||
'timeout': 14400}
|
||||
]
|
||||
},
|
||||
@ -1175,7 +1175,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
|
||||
'entity_names': ['controller-0']},
|
||||
_unlock_hosts_stage_as_dict(['controller-0']),
|
||||
{'name': 'wait-data-sync',
|
||||
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'],
|
||||
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'],
|
||||
'timeout': 14400}
|
||||
]
|
||||
}
|
||||
@ -1246,7 +1246,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
|
||||
'entity_names': ['controller-0']},
|
||||
_unlock_hosts_stage_as_dict(['controller-0']),
|
||||
{'name': 'wait-data-sync',
|
||||
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'],
|
||||
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'],
|
||||
'timeout': 14400}
|
||||
]
|
||||
},
|
||||
@ -1262,7 +1262,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
|
||||
'entity_names': ['controller-1']},
|
||||
_unlock_hosts_stage_as_dict(['controller-1']),
|
||||
{'name': 'wait-data-sync',
|
||||
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'],
|
||||
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'],
|
||||
'timeout': 14400}
|
||||
]
|
||||
},
|
||||
@ -1363,7 +1363,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
|
||||
'entity_names': ['controller-1']},
|
||||
_unlock_hosts_stage_as_dict(['controller-1']),
|
||||
{'name': 'wait-data-sync',
|
||||
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'],
|
||||
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'],
|
||||
'timeout': 14400}
|
||||
]
|
||||
},
|
||||
@ -1377,7 +1377,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
|
||||
'entity_names': ['controller-0']},
|
||||
_unlock_hosts_stage_as_dict(['controller-0']),
|
||||
{'name': 'wait-data-sync',
|
||||
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'],
|
||||
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'],
|
||||
'timeout': 14400}
|
||||
]
|
||||
},
|
||||
@ -1391,7 +1391,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
|
||||
'entity_names': ['storage-0']},
|
||||
_unlock_hosts_stage_as_dict(['storage-0']),
|
||||
{'name': 'wait-data-sync',
|
||||
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'],
|
||||
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'],
|
||||
'timeout': 7200}
|
||||
]
|
||||
},
|
||||
@ -1405,7 +1405,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
|
||||
'entity_names': ['storage-1']},
|
||||
_unlock_hosts_stage_as_dict(['storage-1']),
|
||||
{'name': 'wait-data-sync',
|
||||
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'],
|
||||
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'],
|
||||
'timeout': 7200}
|
||||
]
|
||||
},
|
||||
@ -1501,7 +1501,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
|
||||
'entity_names': ['controller-1']},
|
||||
_unlock_hosts_stage_as_dict(['controller-1']),
|
||||
{'name': 'wait-data-sync',
|
||||
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'],
|
||||
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'],
|
||||
'timeout': 14400}
|
||||
]
|
||||
},
|
||||
@ -1517,7 +1517,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
|
||||
'entity_names': ['controller-0']},
|
||||
_unlock_hosts_stage_as_dict(['controller-0']),
|
||||
{'name': 'wait-data-sync',
|
||||
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'],
|
||||
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'],
|
||||
'timeout': 14400}
|
||||
]
|
||||
},
|
||||
|
@ -1391,6 +1391,7 @@ class SwPatchStrategy(SwUpdateStrategy,
|
||||
'700.004', # VM stopped
|
||||
'280.002', # Subcloud resource out-of-sync
|
||||
'100.119', # PTP alarm for SyncE
|
||||
'900.701', # Node tainted
|
||||
]
|
||||
self._ignore_alarms += IGNORE_ALARMS
|
||||
self._single_controller = single_controller
|
||||
@ -1741,6 +1742,7 @@ class SwUpgradeStrategy(SwUpdateStrategy):
|
||||
'900.201', # Software upgrade auto apply in progress
|
||||
'750.006', # Configuration change requires reapply of cert-manager
|
||||
'100.119', # PTP alarm for SyncE
|
||||
'900.701', # Node tainted
|
||||
]
|
||||
self._ignore_alarms += IGNORE_ALARMS
|
||||
self._single_controller = single_controller
|
||||
@ -2347,6 +2349,7 @@ class SystemConfigUpdateStrategy(SwUpdateStrategy,
|
||||
'750.006', # Configuration change requires reapply of an application
|
||||
'900.010', # System Config Update in progress
|
||||
'900.601', # System Config Update Auto Apply in progress
|
||||
'900.701', # Node tainted
|
||||
]
|
||||
self._ignore_alarms += IGNORE_ALARMS
|
||||
self._single_controller = single_controller
|
||||
@ -2530,6 +2533,7 @@ class FwUpdateStrategy(SwUpdateStrategy):
|
||||
'900.301', # Fw Update Auto Apply in progress
|
||||
'200.001', # Locked Host
|
||||
'100.119', # PTP alarm for SyncE
|
||||
'900.701', # Node tainted
|
||||
]
|
||||
|
||||
self._ignore_alarms += IGNORE_ALARMS
|
||||
@ -2898,6 +2902,7 @@ class KubeRootcaUpdateStrategy(SwUpdateStrategy,
|
||||
'900.008', # Kubernetes rootca update in progress
|
||||
'900.009', # Kubernetes rootca update aborted
|
||||
'900.501', # Kubernetes rootca update auto-apply inprogress
|
||||
'900.701', # Node tainted
|
||||
]
|
||||
# self._ignore_alarms is declared in parent class
|
||||
self._ignore_alarms += IGNORE_ALARMS
|
||||
@ -3299,6 +3304,7 @@ class KubeUpgradeStrategy(SwUpdateStrategy,
|
||||
'750.006', # Configuration change requires reapply of cert-manager
|
||||
'900.007', # Kube Upgrade in progress
|
||||
'900.401', # kube-upgrade-auto-apply-inprogress
|
||||
'900.701', # Node tainted
|
||||
]
|
||||
# self._ignore_alarms is declared in parent class
|
||||
self._ignore_alarms += IGNORE_ALARMS
|
||||
|
Loading…
Reference in New Issue
Block a user