Fix NetApp NFS driver to never spawn a native thread again

The NetApp NFS driver is spawning a Python native thread to
schedule image cache cleanups, which can lead to unpredictable
behavior such as thread starvation due to its hunger to preempt
other Cinder's fellow threads.

This patch fixes it by reimplementing the NFS image cache cleanup
task as a green thread with a customized periodic interval.

The patch adds the configuration
option ´netapp_nfs_image_cache_cleanup_interval´ to allow
operators to set an interval in seconds between calls to the image
cleanup task.

Closes-Bug: #2008017
Change-Id: I609a4f056d19f67aa7b8113bb38b9e9083f24124
This commit is contained in:
Fernando Ferraz 2022-08-16 03:48:43 +00:00 committed by Felipe Rodrigues
parent e93c99bfae
commit a869c8e222
6 changed files with 51 additions and 76 deletions

View File

@ -16,7 +16,6 @@
"""Unit tests for the NetApp NFS storage driver."""
import copy
import os
import threading
import time
from unittest import mock
@ -620,36 +619,6 @@ class NetAppNfsDriverTestCase(test.TestCase):
os.path.exists.assert_called_once_with(
'dir/' + fake.CLONE_DESTINATION_NAME)
def test__spawn_clean_cache_job_clean_job_setup(self):
self.driver.cleaning = True
mock_debug_log = self.mock_object(nfs_base.LOG, 'debug')
self.mock_object(utils, 'synchronized', return_value=lambda f: f)
retval = self.driver._spawn_clean_cache_job()
self.assertIsNone(retval)
self.assertEqual(1, mock_debug_log.call_count)
def test__spawn_clean_cache_job_new_clean_job(self):
class FakeTimer(object):
def start(self):
pass
fake_timer = FakeTimer()
self.mock_object(utils, 'synchronized', return_value=lambda f: f)
self.mock_object(fake_timer, 'start')
self.mock_object(nfs_base.LOG, 'debug')
self.mock_object(self.driver, '_clean_image_cache')
self.mock_object(threading, 'Timer', return_value=fake_timer)
retval = self.driver._spawn_clean_cache_job()
self.assertIsNone(retval)
threading.Timer.assert_called_once_with(
0, self.driver._clean_image_cache)
fake_timer.start.assert_called_once_with()
def test_cleanup_volume_on_failure(self):
path = '%s/%s' % (fake.NFS_SHARE, fake.NFS_VOLUME['name'])
mock_local_path = self.mock_object(self.driver, 'local_path')
@ -1077,13 +1046,22 @@ class NetAppNfsDriverTestCase(test.TestCase):
self.driver, '_delete_snapshots_marked_for_deletion')
mock_call_ems_logging = self.mock_object(
self.driver, '_handle_ems_logging')
mock_call_clean_image_cache = self.mock_object(
self.driver, '_clean_image_cache')
# image cache cleanup task can be configured with custom timeout
cache_cleanup_interval = loopingcalls.ONE_HOUR
self.driver.configuration.netapp_nfs_image_cache_cleanup_interval = (
cache_cleanup_interval)
self.driver._add_looping_tasks()
mock_add_task.assert_has_calls([
mock.call(mock_call_snap_cleanup, loopingcalls.ONE_MINUTE,
loopingcalls.ONE_MINUTE),
mock.call(mock_call_ems_logging, loopingcalls.ONE_HOUR)])
mock.call(mock_call_ems_logging, loopingcalls.ONE_HOUR),
mock.call(mock_call_clean_image_cache, cache_cleanup_interval)
])
def test__clone_from_cache(self):
image_id = 'fake_image_id'

View File

@ -137,7 +137,6 @@ class NetAppCmodeNfsDriverTestCase(test.TestCase):
mock_debug_log = self.mock_object(nfs_cmode.LOG, 'debug')
self.mock_object(self.driver, 'get_filter_function')
self.mock_object(self.driver, 'get_goodness_function')
self.mock_object(self.driver, '_spawn_clean_cache_job')
self.driver.zapi_client = mock.Mock()
self.mock_object(self.driver, '_get_pool_stats', return_value={})
expected_stats = {
@ -153,7 +152,6 @@ class NetAppCmodeNfsDriverTestCase(test.TestCase):
retval = self.driver._update_volume_stats()
self.assertIsNone(retval)
self.assertTrue(self.driver._spawn_clean_cache_job.called)
self.assertEqual(1, mock_debug_log.call_count)
self.assertEqual(expected_stats, self.driver._stats)

View File

@ -25,7 +25,6 @@ import copy
import math
import os
import re
import threading
import time
from oslo_concurrency import processutils
@ -115,6 +114,13 @@ class NetAppNfsDriver(driver.ManageableVD,
self._handle_ems_logging,
loopingcalls.ONE_HOUR)
# Add the task that periodically cleanup old expired internal
# image caching.
self.loopingcalls.add_task(
self._clean_image_cache,
self.configuration.netapp_nfs_image_cache_cleanup_interval
)
def _delete_snapshots_marked_for_deletion(self):
snapshots = self.zapi_client.get_snapshots_marked_for_deletion()
for snapshot in snapshots:
@ -546,49 +552,32 @@ class NetAppNfsDriver(driver.ManageableVD,
os.utime(src_path, None)
_do_clone()
@utils.synchronized('clean_cache')
def _spawn_clean_cache_job(self):
"""Spawns a clean task if not running."""
if getattr(self, 'cleaning', None):
LOG.debug('Image cache cleaning in progress. Returning... ')
return
else:
# Set cleaning to True
self.cleaning = True
t = threading.Timer(0, self._clean_image_cache)
t.start()
def _clean_image_cache(self):
"""Clean the image cache files in cache of space crunch."""
try:
LOG.debug('Image cache cleaning in progress.')
thres_size_perc_start = (
self.configuration.thres_avl_size_perc_start)
thres_size_perc_stop = self.configuration.thres_avl_size_perc_stop
for share in getattr(self, '_mounted_shares', []):
try:
total_size, total_avl = self._get_capacity_info(share)
avl_percent = int((float(total_avl) / total_size) * 100)
if avl_percent <= thres_size_perc_start:
LOG.info('Cleaning cache for share %s.', share)
eligible_files = self._find_old_cache_files(share)
threshold_size = int(
(thres_size_perc_stop * total_size) / 100)
bytes_to_free = int(threshold_size - total_avl)
LOG.debug('Files to be queued for deletion %s',
eligible_files)
self._delete_files_till_bytes_free(
eligible_files, share, bytes_to_free)
else:
continue
except Exception as e:
LOG.warning('Exception during cache cleaning'
' %(share)s. Message - %(ex)s',
{'share': share, 'ex': e})
LOG.debug('Image cache cleaning in progress.')
thres_size_perc_start = (
self.configuration.thres_avl_size_perc_start)
thres_size_perc_stop = self.configuration.thres_avl_size_perc_stop
for share in self._mounted_shares:
try:
total_size, total_avl = self._get_capacity_info(share)
avl_percent = int((float(total_avl) / total_size) * 100)
if avl_percent <= thres_size_perc_start:
LOG.info('Cleaning cache for share %s.', share)
eligible_files = self._find_old_cache_files(share)
threshold_size = int(
(thres_size_perc_stop * total_size) / 100)
bytes_to_free = int(threshold_size - total_avl)
LOG.debug('Files to be queued for deletion %s',
eligible_files)
self._delete_files_till_bytes_free(
eligible_files, share, bytes_to_free)
else:
continue
finally:
LOG.debug('Image cache cleaning done.')
self.cleaning = False
except Exception as e:
LOG.warning('Exception during cache cleaning'
' %(share)s. Message - %(ex)s',
{'share': share, 'ex': e})
def _shortlist_del_eligible_files(self, share, old_files):
"""Prepares list of eligible files to be deleted from cache."""

View File

@ -337,7 +337,6 @@ class NetAppCmodeNfsDriver(nfs_base.NetAppNfsDriver,
# Used for service state report
data['replication_enabled'] = self.replication_enabled
self._spawn_clean_cache_job()
self._stats = data
def _get_pool_stats(self, filter_function=None, goodness_function=None):

View File

@ -102,6 +102,11 @@ netapp_cluster_opts = [
'provisioning of block storage volumes should occur.')), ]
netapp_img_cache_opts = [
cfg.IntOpt('netapp_nfs_image_cache_cleanup_interval',
default=600,
min=60,
help=('Sets time in seconds between NFS image cache '
'cleanup tasks.')),
cfg.IntOpt('thres_avl_size_perc_start',
default=20,
help=('If the percentage of available space for an NFS share '

View File

@ -0,0 +1,6 @@
---
fixes:
- |
`Bug #2008017 <https://bugs.launchpad.net/cinder/+bug/2008017>`_: Fixed
NetApp NFS driver to never spawn a native thread avoid thread starvation
and other related issues.