From cbb9121a289603ec003dec098b8fa5918ca98300 Mon Sep 17 00:00:00 2001 From: Marcus Secato Date: Thu, 1 Apr 2021 16:30:13 -0400 Subject: [PATCH] Adjust lock acquiring logic When locking the file descriptor skip_udev_partition_probe was not handling errors thrown by fcntl.flock which was leading controller-0 to degraded state after unlock. This change aims to strengthen that logic by handling the error properly, retrying the lock operation and improving logs. Closes-Bug: 1922256 Signed-off-by: Marcus Secato Change-Id: I000367668744a4e92e20ff9d3f1f8cd717883a46 --- sysinv/sysinv/sysinv/sysinv/agent/manager.py | 21 +------------- sysinv/sysinv/sysinv/sysinv/common/utils.py | 30 +++++++++++++++++++- 2 files changed, 30 insertions(+), 21 deletions(-) diff --git a/sysinv/sysinv/sysinv/sysinv/agent/manager.py b/sysinv/sysinv/sysinv/sysinv/agent/manager.py index b6c99244ef..69c227faa3 100644 --- a/sysinv/sysinv/sysinv/sysinv/agent/manager.py +++ b/sysinv/sysinv/sysinv/sysinv/agent/manager.py @@ -34,7 +34,6 @@ Commands (from conductors) are received via RPC calls. """ from __future__ import print_function -import errno from eventlet.green import subprocess import fcntl import fileinput @@ -583,25 +582,7 @@ class AgentManager(service.PeriodicService): """ lock_file_fd = os.open( constants.NETWORK_CONFIG_LOCK_FILE, os.O_CREAT | os.O_RDONLY) - count = 1 - delay = 5 - max_count = 5 - while count <= max_count: - try: - fcntl.flock(lock_file_fd, fcntl.LOCK_EX | fcntl.LOCK_NB) - return lock_file_fd - except IOError as e: - # raise on unrelated IOErrors - if e.errno != errno.EAGAIN: - raise - else: - LOG.info("Could not acquire lock({}): {} ({}/{}), " - "will retry".format(lock_file_fd, str(e), - count, max_count)) - time.sleep(delay) - count += 1 - LOG.error("Failed to acquire lock (fd={})".format(lock_file_fd)) - return 0 + return utils.acquire_file_lock(lock_file_fd) def _release_network_config_lock(self, lockfd): """ Release the lock guarding apply_network_config.sh """ diff --git a/sysinv/sysinv/sysinv/sysinv/common/utils.py b/sysinv/sysinv/sysinv/sysinv/common/utils.py index f8ff7675a9..251d4faf6c 100644 --- a/sysinv/sysinv/sysinv/sysinv/common/utils.py +++ b/sysinv/sysinv/sysinv/sysinv/common/utils.py @@ -1372,6 +1372,34 @@ def _get_cinder_device_info(dbapi, forihostid): return cinder_device, cinder_size_gib +def acquire_file_lock(lockfd, max_retry=5, wait_interval=5): + """ + This method is to acquire a lock for the given file descriptor to + avoid conflict with other processes trying accessing the same file. + + :returns: fd of the lock, if successful. 0 on error. + """ + count = 1 + while count <= max_retry: + try: + fcntl.flock(lockfd, fcntl.LOCK_EX | fcntl.LOCK_NB) + LOG.debug("Successfully acquired lock (fd={})".format(lockfd)) + return lockfd + except IOError as e: + # raise on unrelated IOErrors + if e.errno != errno.EAGAIN: + raise + else: + LOG.info("Could not acquire lock({}): {} ({}/{}), " + "will retry".format(lockfd, str(e), + count, max_retry)) + time.sleep(wait_interval) + count += 1 + + LOG.error("Failed to acquire lock (fd={}). Stop trying...".format(lockfd)) + return 0 + + def skip_udev_partition_probe(function): def wrapper(*args, **kwargs): """Decorator to skip partition rescanning in udev @@ -1399,7 +1427,7 @@ def skip_udev_partition_probe(function): device_node = kwargs.get('device_node', None) if device_node: with open(device_node, 'r') as f: - fcntl.flock(f, fcntl.LOCK_SH | fcntl.LOCK_NB) + acquire_file_lock(f) try: return function(*args, **kwargs) finally: