From a869c8e222a68af731eb72e9f8ec5f0755db0326 Mon Sep 17 00:00:00 2001 From: Fernando Ferraz Date: Tue, 16 Aug 2022 03:48:43 +0000 Subject: [PATCH] Fix NetApp NFS driver to never spawn a native thread again MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The NetApp NFS driver is spawning a Python native thread to schedule image cache cleanups, which can lead to unpredictable behavior such as thread starvation due to its hunger to preempt other Cinder's fellow threads. This patch fixes it by reimplementing the NFS image cache cleanup task as a green thread with a customized periodic interval. The patch adds the configuration option ´netapp_nfs_image_cache_cleanup_interval´ to allow operators to set an interval in seconds between calls to the image cleanup task. Closes-Bug: #2008017 Change-Id: I609a4f056d19f67aa7b8113bb38b9e9083f24124 --- .../drivers/netapp/dataontap/test_nfs_base.py | 42 +++-------- .../netapp/dataontap/test_nfs_cmode.py | 2 - .../drivers/netapp/dataontap/nfs_base.py | 71 ++++++++----------- .../drivers/netapp/dataontap/nfs_cmode.py | 1 - cinder/volume/drivers/netapp/options.py | 5 ++ ...p-fix-native-threads-04d8f58f4c29b03d.yaml | 6 ++ 6 files changed, 51 insertions(+), 76 deletions(-) create mode 100644 releasenotes/notes/bug-2008017-netapp-fix-native-threads-04d8f58f4c29b03d.yaml diff --git a/cinder/tests/unit/volume/drivers/netapp/dataontap/test_nfs_base.py b/cinder/tests/unit/volume/drivers/netapp/dataontap/test_nfs_base.py index 550da728a86..9c6e686cd91 100644 --- a/cinder/tests/unit/volume/drivers/netapp/dataontap/test_nfs_base.py +++ b/cinder/tests/unit/volume/drivers/netapp/dataontap/test_nfs_base.py @@ -16,7 +16,6 @@ """Unit tests for the NetApp NFS storage driver.""" import copy import os -import threading import time from unittest import mock @@ -620,36 +619,6 @@ class NetAppNfsDriverTestCase(test.TestCase): os.path.exists.assert_called_once_with( 'dir/' + fake.CLONE_DESTINATION_NAME) - def test__spawn_clean_cache_job_clean_job_setup(self): - self.driver.cleaning = True - mock_debug_log = self.mock_object(nfs_base.LOG, 'debug') - self.mock_object(utils, 'synchronized', return_value=lambda f: f) - - retval = self.driver._spawn_clean_cache_job() - - self.assertIsNone(retval) - self.assertEqual(1, mock_debug_log.call_count) - - def test__spawn_clean_cache_job_new_clean_job(self): - - class FakeTimer(object): - def start(self): - pass - - fake_timer = FakeTimer() - self.mock_object(utils, 'synchronized', return_value=lambda f: f) - self.mock_object(fake_timer, 'start') - self.mock_object(nfs_base.LOG, 'debug') - self.mock_object(self.driver, '_clean_image_cache') - self.mock_object(threading, 'Timer', return_value=fake_timer) - - retval = self.driver._spawn_clean_cache_job() - - self.assertIsNone(retval) - threading.Timer.assert_called_once_with( - 0, self.driver._clean_image_cache) - fake_timer.start.assert_called_once_with() - def test_cleanup_volume_on_failure(self): path = '%s/%s' % (fake.NFS_SHARE, fake.NFS_VOLUME['name']) mock_local_path = self.mock_object(self.driver, 'local_path') @@ -1077,13 +1046,22 @@ class NetAppNfsDriverTestCase(test.TestCase): self.driver, '_delete_snapshots_marked_for_deletion') mock_call_ems_logging = self.mock_object( self.driver, '_handle_ems_logging') + mock_call_clean_image_cache = self.mock_object( + self.driver, '_clean_image_cache') + + # image cache cleanup task can be configured with custom timeout + cache_cleanup_interval = loopingcalls.ONE_HOUR + self.driver.configuration.netapp_nfs_image_cache_cleanup_interval = ( + cache_cleanup_interval) self.driver._add_looping_tasks() mock_add_task.assert_has_calls([ mock.call(mock_call_snap_cleanup, loopingcalls.ONE_MINUTE, loopingcalls.ONE_MINUTE), - mock.call(mock_call_ems_logging, loopingcalls.ONE_HOUR)]) + mock.call(mock_call_ems_logging, loopingcalls.ONE_HOUR), + mock.call(mock_call_clean_image_cache, cache_cleanup_interval) + ]) def test__clone_from_cache(self): image_id = 'fake_image_id' diff --git a/cinder/tests/unit/volume/drivers/netapp/dataontap/test_nfs_cmode.py b/cinder/tests/unit/volume/drivers/netapp/dataontap/test_nfs_cmode.py index d443234c544..308e22c049c 100644 --- a/cinder/tests/unit/volume/drivers/netapp/dataontap/test_nfs_cmode.py +++ b/cinder/tests/unit/volume/drivers/netapp/dataontap/test_nfs_cmode.py @@ -137,7 +137,6 @@ class NetAppCmodeNfsDriverTestCase(test.TestCase): mock_debug_log = self.mock_object(nfs_cmode.LOG, 'debug') self.mock_object(self.driver, 'get_filter_function') self.mock_object(self.driver, 'get_goodness_function') - self.mock_object(self.driver, '_spawn_clean_cache_job') self.driver.zapi_client = mock.Mock() self.mock_object(self.driver, '_get_pool_stats', return_value={}) expected_stats = { @@ -153,7 +152,6 @@ class NetAppCmodeNfsDriverTestCase(test.TestCase): retval = self.driver._update_volume_stats() self.assertIsNone(retval) - self.assertTrue(self.driver._spawn_clean_cache_job.called) self.assertEqual(1, mock_debug_log.call_count) self.assertEqual(expected_stats, self.driver._stats) diff --git a/cinder/volume/drivers/netapp/dataontap/nfs_base.py b/cinder/volume/drivers/netapp/dataontap/nfs_base.py index 3b8a32266d2..5f913584f91 100644 --- a/cinder/volume/drivers/netapp/dataontap/nfs_base.py +++ b/cinder/volume/drivers/netapp/dataontap/nfs_base.py @@ -25,7 +25,6 @@ import copy import math import os import re -import threading import time from oslo_concurrency import processutils @@ -115,6 +114,13 @@ class NetAppNfsDriver(driver.ManageableVD, self._handle_ems_logging, loopingcalls.ONE_HOUR) + # Add the task that periodically cleanup old expired internal + # image caching. + self.loopingcalls.add_task( + self._clean_image_cache, + self.configuration.netapp_nfs_image_cache_cleanup_interval + ) + def _delete_snapshots_marked_for_deletion(self): snapshots = self.zapi_client.get_snapshots_marked_for_deletion() for snapshot in snapshots: @@ -546,49 +552,32 @@ class NetAppNfsDriver(driver.ManageableVD, os.utime(src_path, None) _do_clone() - @utils.synchronized('clean_cache') - def _spawn_clean_cache_job(self): - """Spawns a clean task if not running.""" - if getattr(self, 'cleaning', None): - LOG.debug('Image cache cleaning in progress. Returning... ') - return - else: - # Set cleaning to True - self.cleaning = True - t = threading.Timer(0, self._clean_image_cache) - t.start() - def _clean_image_cache(self): """Clean the image cache files in cache of space crunch.""" - try: - LOG.debug('Image cache cleaning in progress.') - thres_size_perc_start = ( - self.configuration.thres_avl_size_perc_start) - thres_size_perc_stop = self.configuration.thres_avl_size_perc_stop - for share in getattr(self, '_mounted_shares', []): - try: - total_size, total_avl = self._get_capacity_info(share) - avl_percent = int((float(total_avl) / total_size) * 100) - if avl_percent <= thres_size_perc_start: - LOG.info('Cleaning cache for share %s.', share) - eligible_files = self._find_old_cache_files(share) - threshold_size = int( - (thres_size_perc_stop * total_size) / 100) - bytes_to_free = int(threshold_size - total_avl) - LOG.debug('Files to be queued for deletion %s', - eligible_files) - self._delete_files_till_bytes_free( - eligible_files, share, bytes_to_free) - else: - continue - except Exception as e: - LOG.warning('Exception during cache cleaning' - ' %(share)s. Message - %(ex)s', - {'share': share, 'ex': e}) + LOG.debug('Image cache cleaning in progress.') + thres_size_perc_start = ( + self.configuration.thres_avl_size_perc_start) + thres_size_perc_stop = self.configuration.thres_avl_size_perc_stop + for share in self._mounted_shares: + try: + total_size, total_avl = self._get_capacity_info(share) + avl_percent = int((float(total_avl) / total_size) * 100) + if avl_percent <= thres_size_perc_start: + LOG.info('Cleaning cache for share %s.', share) + eligible_files = self._find_old_cache_files(share) + threshold_size = int( + (thres_size_perc_stop * total_size) / 100) + bytes_to_free = int(threshold_size - total_avl) + LOG.debug('Files to be queued for deletion %s', + eligible_files) + self._delete_files_till_bytes_free( + eligible_files, share, bytes_to_free) + else: continue - finally: - LOG.debug('Image cache cleaning done.') - self.cleaning = False + except Exception as e: + LOG.warning('Exception during cache cleaning' + ' %(share)s. Message - %(ex)s', + {'share': share, 'ex': e}) def _shortlist_del_eligible_files(self, share, old_files): """Prepares list of eligible files to be deleted from cache.""" diff --git a/cinder/volume/drivers/netapp/dataontap/nfs_cmode.py b/cinder/volume/drivers/netapp/dataontap/nfs_cmode.py index 0cccbef649f..c383db6bb3f 100644 --- a/cinder/volume/drivers/netapp/dataontap/nfs_cmode.py +++ b/cinder/volume/drivers/netapp/dataontap/nfs_cmode.py @@ -337,7 +337,6 @@ class NetAppCmodeNfsDriver(nfs_base.NetAppNfsDriver, # Used for service state report data['replication_enabled'] = self.replication_enabled - self._spawn_clean_cache_job() self._stats = data def _get_pool_stats(self, filter_function=None, goodness_function=None): diff --git a/cinder/volume/drivers/netapp/options.py b/cinder/volume/drivers/netapp/options.py index c32bd27990a..a8739a3cbf4 100644 --- a/cinder/volume/drivers/netapp/options.py +++ b/cinder/volume/drivers/netapp/options.py @@ -102,6 +102,11 @@ netapp_cluster_opts = [ 'provisioning of block storage volumes should occur.')), ] netapp_img_cache_opts = [ + cfg.IntOpt('netapp_nfs_image_cache_cleanup_interval', + default=600, + min=60, + help=('Sets time in seconds between NFS image cache ' + 'cleanup tasks.')), cfg.IntOpt('thres_avl_size_perc_start', default=20, help=('If the percentage of available space for an NFS share ' diff --git a/releasenotes/notes/bug-2008017-netapp-fix-native-threads-04d8f58f4c29b03d.yaml b/releasenotes/notes/bug-2008017-netapp-fix-native-threads-04d8f58f4c29b03d.yaml new file mode 100644 index 00000000000..c6b05987df6 --- /dev/null +++ b/releasenotes/notes/bug-2008017-netapp-fix-native-threads-04d8f58f4c29b03d.yaml @@ -0,0 +1,6 @@ +--- +fixes: + - | + `Bug #2008017 `_: Fixed + NetApp NFS driver to never spawn a native thread avoid thread starvation + and other related issues.