Fix NetApp NFS driver to never spawn a native thread again

The NetApp NFS driver is spawning a Python native thread to schedule image cache cleanups, which can lead to unpredictable behavior such as thread starvation due to its hunger to preempt other Cinder's fellow threads. This patch fixes it by reimplementing the NFS image cache cleanup task as a green thread with a customized periodic interval. The patch adds the configuration option ´netapp_nfs_image_cache_cleanup_interval´ to allow operators to set an interval in seconds between calls to the image cleanup task. Closes-Bug: #2008017 Change-Id: I609a4f056d19f67aa7b8113bb38b9e9083f24124
2022-08-16 03:48:43 +00:00 · 2022-08-16 03:48:43 +00:00 · a869c8e222
commit a869c8e222
parent e93c99bfae
6 changed files with 51 additions and 76 deletions
--- a/cinder/tests/unit/volume/drivers/netapp/dataontap/test_nfs_base.py
+++ b/cinder/tests/unit/volume/drivers/netapp/dataontap/test_nfs_base.py
@ -16,7 +16,6 @@
 """Unit tests for the NetApp NFS storage driver."""
 import copy
 import os
-import threading
 import time
 from unittest import mock

@ -620,36 +619,6 @@ class NetAppNfsDriverTestCase(test.TestCase):
        os.path.exists.assert_called_once_with(
            'dir/' + fake.CLONE_DESTINATION_NAME)

-    def test__spawn_clean_cache_job_clean_job_setup(self):
-        self.driver.cleaning = True
-        mock_debug_log = self.mock_object(nfs_base.LOG, 'debug')
-        self.mock_object(utils, 'synchronized', return_value=lambda f: f)
-
-        retval = self.driver._spawn_clean_cache_job()
-
-        self.assertIsNone(retval)
-        self.assertEqual(1, mock_debug_log.call_count)
-
-    def test__spawn_clean_cache_job_new_clean_job(self):
-
-        class FakeTimer(object):
-            def start(self):
-                pass
-
-        fake_timer = FakeTimer()
-        self.mock_object(utils, 'synchronized', return_value=lambda f: f)
-        self.mock_object(fake_timer, 'start')
-        self.mock_object(nfs_base.LOG, 'debug')
-        self.mock_object(self.driver, '_clean_image_cache')
-        self.mock_object(threading, 'Timer', return_value=fake_timer)
-
-        retval = self.driver._spawn_clean_cache_job()
-
-        self.assertIsNone(retval)
-        threading.Timer.assert_called_once_with(
-            0, self.driver._clean_image_cache)
-        fake_timer.start.assert_called_once_with()
-
    def test_cleanup_volume_on_failure(self):
        path = '%s/%s' % (fake.NFS_SHARE, fake.NFS_VOLUME['name'])
        mock_local_path = self.mock_object(self.driver, 'local_path')
@ -1077,13 +1046,22 @@ class NetAppNfsDriverTestCase(test.TestCase):
            self.driver, '_delete_snapshots_marked_for_deletion')
        mock_call_ems_logging = self.mock_object(
            self.driver, '_handle_ems_logging')
+        mock_call_clean_image_cache = self.mock_object(
+            self.driver, '_clean_image_cache')
+
+        # image cache cleanup task can be configured with custom timeout
+        cache_cleanup_interval = loopingcalls.ONE_HOUR
+        self.driver.configuration.netapp_nfs_image_cache_cleanup_interval = (
+            cache_cleanup_interval)

        self.driver._add_looping_tasks()

        mock_add_task.assert_has_calls([
            mock.call(mock_call_snap_cleanup, loopingcalls.ONE_MINUTE,
                      loopingcalls.ONE_MINUTE),
-            mock.call(mock_call_ems_logging, loopingcalls.ONE_HOUR)])
+            mock.call(mock_call_ems_logging, loopingcalls.ONE_HOUR),
+            mock.call(mock_call_clean_image_cache, cache_cleanup_interval)
+        ])

    def test__clone_from_cache(self):
        image_id = 'fake_image_id'
--- a/cinder/tests/unit/volume/drivers/netapp/dataontap/test_nfs_cmode.py
+++ b/cinder/tests/unit/volume/drivers/netapp/dataontap/test_nfs_cmode.py
@ -137,7 +137,6 @@ class NetAppCmodeNfsDriverTestCase(test.TestCase):
        mock_debug_log = self.mock_object(nfs_cmode.LOG, 'debug')
        self.mock_object(self.driver, 'get_filter_function')
        self.mock_object(self.driver, 'get_goodness_function')
-        self.mock_object(self.driver, '_spawn_clean_cache_job')
        self.driver.zapi_client = mock.Mock()
        self.mock_object(self.driver, '_get_pool_stats', return_value={})
        expected_stats = {
@ -153,7 +152,6 @@ class NetAppCmodeNfsDriverTestCase(test.TestCase):
        retval = self.driver._update_volume_stats()

        self.assertIsNone(retval)
-        self.assertTrue(self.driver._spawn_clean_cache_job.called)
        self.assertEqual(1, mock_debug_log.call_count)
        self.assertEqual(expected_stats, self.driver._stats)

--- a/cinder/volume/drivers/netapp/dataontap/nfs_base.py
+++ b/cinder/volume/drivers/netapp/dataontap/nfs_base.py
@ -25,7 +25,6 @@ import copy
 import math
 import os
 import re
-import threading
 import time

 from oslo_concurrency import processutils
@ -115,6 +114,13 @@ class NetAppNfsDriver(driver.ManageableVD,
            self._handle_ems_logging,
            loopingcalls.ONE_HOUR)

+        # Add the task that periodically cleanup old expired internal
+        # image caching.
+        self.loopingcalls.add_task(
+            self._clean_image_cache,
+            self.configuration.netapp_nfs_image_cache_cleanup_interval
+        )
+
    def _delete_snapshots_marked_for_deletion(self):
        snapshots = self.zapi_client.get_snapshots_marked_for_deletion()
        for snapshot in snapshots:
@ -546,49 +552,32 @@ class NetAppNfsDriver(driver.ManageableVD,
                os.utime(src_path, None)
        _do_clone()

-    @utils.synchronized('clean_cache')
-    def _spawn_clean_cache_job(self):
-        """Spawns a clean task if not running."""
-        if getattr(self, 'cleaning', None):
-            LOG.debug('Image cache cleaning in progress. Returning... ')
-            return
-        else:
-            # Set cleaning to True
-            self.cleaning = True
-            t = threading.Timer(0, self._clean_image_cache)
-            t.start()
-
    def _clean_image_cache(self):
        """Clean the image cache files in cache of space crunch."""
-        try:
-            LOG.debug('Image cache cleaning in progress.')
-            thres_size_perc_start = (
-                self.configuration.thres_avl_size_perc_start)
-            thres_size_perc_stop = self.configuration.thres_avl_size_perc_stop
-            for share in getattr(self, '_mounted_shares', []):
-                try:
-                    total_size, total_avl = self._get_capacity_info(share)
-                    avl_percent = int((float(total_avl) / total_size) * 100)
-                    if avl_percent <= thres_size_perc_start:
-                        LOG.info('Cleaning cache for share %s.', share)
-                        eligible_files = self._find_old_cache_files(share)
-                        threshold_size = int(
-                            (thres_size_perc_stop * total_size) / 100)
-                        bytes_to_free = int(threshold_size - total_avl)
-                        LOG.debug('Files to be queued for deletion %s',
-                                  eligible_files)
-                        self._delete_files_till_bytes_free(
-                            eligible_files, share, bytes_to_free)
-                    else:
-                        continue
-                except Exception as e:
-                    LOG.warning('Exception during cache cleaning'
-                                ' %(share)s. Message - %(ex)s',
-                                {'share': share, 'ex': e})
+        LOG.debug('Image cache cleaning in progress.')
+        thres_size_perc_start = (
+            self.configuration.thres_avl_size_perc_start)
+        thres_size_perc_stop = self.configuration.thres_avl_size_perc_stop
+        for share in self._mounted_shares:
+            try:
+                total_size, total_avl = self._get_capacity_info(share)
+                avl_percent = int((float(total_avl) / total_size) * 100)
+                if avl_percent <= thres_size_perc_start:
+                    LOG.info('Cleaning cache for share %s.', share)
+                    eligible_files = self._find_old_cache_files(share)
+                    threshold_size = int(
+                        (thres_size_perc_stop * total_size) / 100)
+                    bytes_to_free = int(threshold_size - total_avl)
+                    LOG.debug('Files to be queued for deletion %s',
+                              eligible_files)
+                    self._delete_files_till_bytes_free(
+                        eligible_files, share, bytes_to_free)
+                else:
                    continue
-        finally:
-            LOG.debug('Image cache cleaning done.')
-            self.cleaning = False
+            except Exception as e:
+                LOG.warning('Exception during cache cleaning'
+                            ' %(share)s. Message - %(ex)s',
+                            {'share': share, 'ex': e})

    def _shortlist_del_eligible_files(self, share, old_files):
        """Prepares list of eligible files to be deleted from cache."""
--- a/cinder/volume/drivers/netapp/dataontap/nfs_cmode.py
+++ b/cinder/volume/drivers/netapp/dataontap/nfs_cmode.py
@ -337,7 +337,6 @@ class NetAppCmodeNfsDriver(nfs_base.NetAppNfsDriver,
        # Used for service state report
        data['replication_enabled'] = self.replication_enabled

-        self._spawn_clean_cache_job()
        self._stats = data

    def _get_pool_stats(self, filter_function=None, goodness_function=None):
--- a/cinder/volume/drivers/netapp/options.py
+++ b/cinder/volume/drivers/netapp/options.py
@ -102,6 +102,11 @@ netapp_cluster_opts = [
                     'provisioning of block storage volumes should occur.')), ]

 netapp_img_cache_opts = [
+    cfg.IntOpt('netapp_nfs_image_cache_cleanup_interval',
+               default=600,
+               min=60,
+               help=('Sets time in seconds between NFS image cache '
+                     'cleanup tasks.')),
    cfg.IntOpt('thres_avl_size_perc_start',
               default=20,
               help=('If the percentage of available space for an NFS share '
--- a/releasenotes/notes/bug-2008017-netapp-fix-native-threads-04d8f58f4c29b03d.yaml
+++ b/releasenotes/notes/bug-2008017-netapp-fix-native-threads-04d8f58f4c29b03d.yaml
@ -0,0 +1,6 @@
+---
+fixes:
+  - |
+    `Bug #2008017 <https://bugs.launchpad.net/cinder/+bug/2008017>`_: Fixed
+    NetApp NFS driver to never spawn a native thread avoid thread starvation
+    and other related issues.