Protect FM API shared data with thread locking

Collectd runs all its python plugins concurrently.

With the expansion of collectd python plugins due
to the obsolesence of rmon collectd core dumps are
being reported during collectd startup when the FM
service on the controller is not running.

Debug of the issue revealed that the core dumps are
due to having no mutex around FM API's shared data.

The required mutex is provided by this update by
adding a while locked expression to the start of
each API.

Also fixed 3 pep8 errors.

Closes-Bug: 1819473

Test Plan:
PASS: Test before and after cases to confirm that without
      the change we see core dumps but with the change the
      API and collectd plugin behavior is correct without
      the core dumps.
PASS: System install with current collectd plugins and fm's
      python API enhanced with locking.
PASS: Have sm stop managing the fmManager process, kill it
      and then restart collectd over and over.
      Should not see any collectd core dumps.
PASS: Verify nfv alarming still works

Change-Id: I3d5ef0bd9cb774299b4c0f3b9e33cddb7c0f776c
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
Eric MacDonald 2019-04-03 15:50:41 -04:00
parent b7e7f911e5
commit 7ae75e24c8
1 changed files with 132 additions and 112 deletions

View File

@ -14,6 +14,9 @@ import copy
from . import constants
import six
import fm_core
import threading
fm_api_lock = threading.Lock()
class ClientException(Exception):
@ -91,7 +94,8 @@ class FaultAPIsBase(object):
sep + data.severity + sep + self._check_val(data.reason_text) +
sep + data.alarm_type + sep + data.probable_cause + sep +
self._check_val(data.proposed_repair_action) + sep +
str(data.service_affecting) + sep + str(data.suppression) + sep)
str(data.service_affecting) + sep +
str(data.suppression) + sep)
@staticmethod
def _str_to_alarm(alarm_str):
@ -172,76 +176,86 @@ class FaultAPIsBase(object):
class FaultAPIs(FaultAPIsBase):
def set_fault(self, data):
self._check_required_attributes(data)
self._validate_attributes(data)
buff = self._alarm_to_str(data)
try:
return fm_core.set(buff)
except (RuntimeError, SystemError, TypeError):
return None
with fm_api_lock:
self._check_required_attributes(data)
self._validate_attributes(data)
buff = self._alarm_to_str(data)
try:
return fm_core.set(buff)
except (RuntimeError, SystemError, TypeError):
return None
def clear_fault(self, alarm_id, entity_instance_id):
sep = constants.FM_CLIENT_STR_SEP
buff = (sep + self._check_val(alarm_id) + sep +
self._check_val(entity_instance_id) + sep)
try:
resp = fm_core.clear(buff)
# resp may be True/False/None after FaultAPIsV2 implementation.
# to keep FaultAPIs the same as before, return False for None case.
if resp is True:
return True
else:
with fm_api_lock:
sep = constants.FM_CLIENT_STR_SEP
buff = (sep + self._check_val(alarm_id) + sep +
self._check_val(entity_instance_id) + sep)
try:
resp = fm_core.clear(buff)
# resp may be True/False/None after FaultAPIsV2
# implementation.
# To keep FaultAPIs the same as before,
# return False for None case.
if resp is True:
return True
else:
return False
except (RuntimeError, SystemError, TypeError):
return False
except (RuntimeError, SystemError, TypeError):
return False
def get_fault(self, alarm_id, entity_instance_id):
sep = constants.FM_CLIENT_STR_SEP
buff = (sep + self._check_val(alarm_id) + sep +
self._check_val(entity_instance_id) + sep)
try:
resp = fm_core.get(buff)
if resp:
return self._str_to_alarm(resp)
except (RuntimeError, SystemError, TypeError):
pass
return None
with fm_api_lock:
sep = constants.FM_CLIENT_STR_SEP
buff = (sep + self._check_val(alarm_id) + sep +
self._check_val(entity_instance_id) + sep)
try:
resp = fm_core.get(buff)
if resp:
return self._str_to_alarm(resp)
except (RuntimeError, SystemError, TypeError):
pass
return None
def clear_all(self, entity_instance_id):
try:
resp = fm_core.clear_all(entity_instance_id)
# resp may be True/False/None after FaultAPIsV2 implementation.
# to keep FaultAPIs the same as before, return False for None case.
if resp is True:
return True
else:
with fm_api_lock:
try:
resp = fm_core.clear_all(entity_instance_id)
# resp may be True/False/None after FaultAPIsV2
# implementation.
# To keep FaultAPIs the same as before,
# return False for None case.
if resp is True:
return True
else:
return False
except (RuntimeError, SystemError, TypeError):
return False
except (RuntimeError, SystemError, TypeError):
return False
def get_faults(self, entity_instance_id):
try:
resp = fm_core.get_by_eid(entity_instance_id)
if resp:
data = []
for i in resp:
data.append(self._str_to_alarm(i))
return data
except (RuntimeError, SystemError, TypeError):
pass
return None
with fm_api_lock:
try:
resp = fm_core.get_by_eid(entity_instance_id)
if resp:
data = []
for i in resp:
data.append(self._str_to_alarm(i))
return data
except (RuntimeError, SystemError, TypeError):
pass
return None
def get_faults_by_id(self, alarm_id):
try:
resp = fm_core.get_by_aid(alarm_id)
if resp:
data = []
for i in resp:
data.append(self._str_to_alarm(i))
return data
except (RuntimeError, SystemError, TypeError):
pass
return None
with fm_api_lock:
try:
resp = fm_core.get_by_aid(alarm_id)
if resp:
data = []
for i in resp:
data.append(self._str_to_alarm(i))
return data
except (RuntimeError, SystemError, TypeError):
pass
return None
class FaultAPIsV2(FaultAPIsBase):
@ -251,10 +265,11 @@ class FaultAPIsV2(FaultAPIsBase):
# Exception: 1. Input Alarm format is not valid
# 2. When there is operation failure
def set_fault(self, data):
self._check_required_attributes(data)
self._validate_attributes(data)
buff = self._alarm_to_str(data)
uuid = fm_core.set(buff)
with fm_api_lock:
self._check_required_attributes(data)
self._validate_attributes(data)
buff = self._alarm_to_str(data)
uuid = fm_core.set(buff)
if uuid is None:
raise APIException("Failed to execute set_fault.")
return uuid
@ -264,76 +279,81 @@ class FaultAPIsV2(FaultAPIsBase):
# Alarm doesn't exist: False
# Exception: When there is operation failure
def clear_fault(self, alarm_id, entity_instance_id):
sep = constants.FM_CLIENT_STR_SEP
buff = (sep + self._check_val(alarm_id) + sep +
self._check_val(entity_instance_id) + sep)
resp = fm_core.clear(buff)
if resp is False:
# There is operation failure
raise APIException("Failed to execute clear_fault.")
elif resp is None:
# alarm is not found
return False
else:
return True
with fm_api_lock:
sep = constants.FM_CLIENT_STR_SEP
buff = (sep + self._check_val(alarm_id) + sep +
self._check_val(entity_instance_id) + sep)
resp = fm_core.clear(buff)
if resp is False:
# There is operation failure
raise APIException("Failed to execute clear_fault.")
elif resp is None:
# alarm is not found
return False
else:
return True
# Input: alarm_id, entity_instance_id
# Return: Success: Alarm
# Alarm doesn't exist: None
# Exception: When there is operation failure
def get_fault(self, alarm_id, entity_instance_id):
sep = constants.FM_CLIENT_STR_SEP
buff = (sep + self._check_val(alarm_id) + sep +
self._check_val(entity_instance_id) + sep)
resp = fm_core.get(buff)
if resp is False:
raise APIException("Failed to execute get_fault.")
else:
return self._str_to_alarm(resp) if resp else None
with fm_api_lock:
sep = constants.FM_CLIENT_STR_SEP
buff = (sep + self._check_val(alarm_id) + sep +
self._check_val(entity_instance_id) + sep)
resp = fm_core.get(buff)
if resp is False:
raise APIException("Failed to execute get_fault.")
else:
return self._str_to_alarm(resp) if resp else None
# Input: entity_instance_id
# Return: Success: True
# Alarm doesn't exist: False
# Exception: When there is operation failure
def clear_all(self, entity_instance_id):
resp = fm_core.clear_all(entity_instance_id)
if resp is False:
# There is operation failure
raise APIException("Failed to execute clear_all.")
elif resp is None:
# alarm is not found
return False
else:
return True
with fm_api_lock:
resp = fm_core.clear_all(entity_instance_id)
if resp is False:
# There is operation failure
raise APIException("Failed to execute clear_all.")
elif resp is None:
# alarm is not found
return False
else:
return True
# Input: entity_instance_id
# Return: Success: Alarm list
# Alarm doesn't exist: None
# Exception: When there is operation failure
def get_faults(self, entity_instance_id):
resp = fm_core.get_by_eid(entity_instance_id)
if resp is False:
raise APIException("Failed to execute get_faults.")
elif resp:
data = []
for i in resp:
data.append(self._str_to_alarm(i))
return data
else:
return None
with fm_api_lock:
resp = fm_core.get_by_eid(entity_instance_id)
if resp is False:
raise APIException("Failed to execute get_faults.")
elif resp:
data = []
for i in resp:
data.append(self._str_to_alarm(i))
return data
else:
return None
# Input: alarm_id
# Return: Success: Alarm list
# Alarm doesn't exist: None
# Exception: When there is operation failure
def get_faults_by_id(self, alarm_id):
resp = fm_core.get_by_aid(alarm_id)
if resp is False:
raise APIException("Failed to execute get_faults_by_id.")
elif resp:
data = []
for i in resp:
data.append(self._str_to_alarm(i))
return data
else:
return None
with fm_api_lock:
resp = fm_core.get_by_aid(alarm_id)
if resp is False:
raise APIException("Failed to execute get_faults_by_id.")
elif resp:
data = []
for i in resp:
data.append(self._str_to_alarm(i))
return data
else:
return None