Fix NetApp NFS driver to never spawn a native thread again
The NetApp NFS driver is spawning a Python native thread to schedule image cache cleanups, which can lead to unpredictable behavior such as thread starvation due to its hunger to preempt other Cinder's fellow threads. This patch fixes it by reimplementing the NFS image cache cleanup task as a green thread with a customized periodic interval. The patch adds the configuration option ´netapp_nfs_image_cache_cleanup_interval´ to allow operators to set an interval in seconds between calls to the image cleanup task. Closes-Bug: #2008017 Change-Id: I609a4f056d19f67aa7b8113bb38b9e9083f24124
This commit is contained in:
parent
e93c99bfae
commit
a869c8e222
@ -16,7 +16,6 @@
|
||||
"""Unit tests for the NetApp NFS storage driver."""
|
||||
import copy
|
||||
import os
|
||||
import threading
|
||||
import time
|
||||
from unittest import mock
|
||||
|
||||
@ -620,36 +619,6 @@ class NetAppNfsDriverTestCase(test.TestCase):
|
||||
os.path.exists.assert_called_once_with(
|
||||
'dir/' + fake.CLONE_DESTINATION_NAME)
|
||||
|
||||
def test__spawn_clean_cache_job_clean_job_setup(self):
|
||||
self.driver.cleaning = True
|
||||
mock_debug_log = self.mock_object(nfs_base.LOG, 'debug')
|
||||
self.mock_object(utils, 'synchronized', return_value=lambda f: f)
|
||||
|
||||
retval = self.driver._spawn_clean_cache_job()
|
||||
|
||||
self.assertIsNone(retval)
|
||||
self.assertEqual(1, mock_debug_log.call_count)
|
||||
|
||||
def test__spawn_clean_cache_job_new_clean_job(self):
|
||||
|
||||
class FakeTimer(object):
|
||||
def start(self):
|
||||
pass
|
||||
|
||||
fake_timer = FakeTimer()
|
||||
self.mock_object(utils, 'synchronized', return_value=lambda f: f)
|
||||
self.mock_object(fake_timer, 'start')
|
||||
self.mock_object(nfs_base.LOG, 'debug')
|
||||
self.mock_object(self.driver, '_clean_image_cache')
|
||||
self.mock_object(threading, 'Timer', return_value=fake_timer)
|
||||
|
||||
retval = self.driver._spawn_clean_cache_job()
|
||||
|
||||
self.assertIsNone(retval)
|
||||
threading.Timer.assert_called_once_with(
|
||||
0, self.driver._clean_image_cache)
|
||||
fake_timer.start.assert_called_once_with()
|
||||
|
||||
def test_cleanup_volume_on_failure(self):
|
||||
path = '%s/%s' % (fake.NFS_SHARE, fake.NFS_VOLUME['name'])
|
||||
mock_local_path = self.mock_object(self.driver, 'local_path')
|
||||
@ -1077,13 +1046,22 @@ class NetAppNfsDriverTestCase(test.TestCase):
|
||||
self.driver, '_delete_snapshots_marked_for_deletion')
|
||||
mock_call_ems_logging = self.mock_object(
|
||||
self.driver, '_handle_ems_logging')
|
||||
mock_call_clean_image_cache = self.mock_object(
|
||||
self.driver, '_clean_image_cache')
|
||||
|
||||
# image cache cleanup task can be configured with custom timeout
|
||||
cache_cleanup_interval = loopingcalls.ONE_HOUR
|
||||
self.driver.configuration.netapp_nfs_image_cache_cleanup_interval = (
|
||||
cache_cleanup_interval)
|
||||
|
||||
self.driver._add_looping_tasks()
|
||||
|
||||
mock_add_task.assert_has_calls([
|
||||
mock.call(mock_call_snap_cleanup, loopingcalls.ONE_MINUTE,
|
||||
loopingcalls.ONE_MINUTE),
|
||||
mock.call(mock_call_ems_logging, loopingcalls.ONE_HOUR)])
|
||||
mock.call(mock_call_ems_logging, loopingcalls.ONE_HOUR),
|
||||
mock.call(mock_call_clean_image_cache, cache_cleanup_interval)
|
||||
])
|
||||
|
||||
def test__clone_from_cache(self):
|
||||
image_id = 'fake_image_id'
|
||||
|
@ -137,7 +137,6 @@ class NetAppCmodeNfsDriverTestCase(test.TestCase):
|
||||
mock_debug_log = self.mock_object(nfs_cmode.LOG, 'debug')
|
||||
self.mock_object(self.driver, 'get_filter_function')
|
||||
self.mock_object(self.driver, 'get_goodness_function')
|
||||
self.mock_object(self.driver, '_spawn_clean_cache_job')
|
||||
self.driver.zapi_client = mock.Mock()
|
||||
self.mock_object(self.driver, '_get_pool_stats', return_value={})
|
||||
expected_stats = {
|
||||
@ -153,7 +152,6 @@ class NetAppCmodeNfsDriverTestCase(test.TestCase):
|
||||
retval = self.driver._update_volume_stats()
|
||||
|
||||
self.assertIsNone(retval)
|
||||
self.assertTrue(self.driver._spawn_clean_cache_job.called)
|
||||
self.assertEqual(1, mock_debug_log.call_count)
|
||||
self.assertEqual(expected_stats, self.driver._stats)
|
||||
|
||||
|
@ -25,7 +25,6 @@ import copy
|
||||
import math
|
||||
import os
|
||||
import re
|
||||
import threading
|
||||
import time
|
||||
|
||||
from oslo_concurrency import processutils
|
||||
@ -115,6 +114,13 @@ class NetAppNfsDriver(driver.ManageableVD,
|
||||
self._handle_ems_logging,
|
||||
loopingcalls.ONE_HOUR)
|
||||
|
||||
# Add the task that periodically cleanup old expired internal
|
||||
# image caching.
|
||||
self.loopingcalls.add_task(
|
||||
self._clean_image_cache,
|
||||
self.configuration.netapp_nfs_image_cache_cleanup_interval
|
||||
)
|
||||
|
||||
def _delete_snapshots_marked_for_deletion(self):
|
||||
snapshots = self.zapi_client.get_snapshots_marked_for_deletion()
|
||||
for snapshot in snapshots:
|
||||
@ -546,49 +552,32 @@ class NetAppNfsDriver(driver.ManageableVD,
|
||||
os.utime(src_path, None)
|
||||
_do_clone()
|
||||
|
||||
@utils.synchronized('clean_cache')
|
||||
def _spawn_clean_cache_job(self):
|
||||
"""Spawns a clean task if not running."""
|
||||
if getattr(self, 'cleaning', None):
|
||||
LOG.debug('Image cache cleaning in progress. Returning... ')
|
||||
return
|
||||
else:
|
||||
# Set cleaning to True
|
||||
self.cleaning = True
|
||||
t = threading.Timer(0, self._clean_image_cache)
|
||||
t.start()
|
||||
|
||||
def _clean_image_cache(self):
|
||||
"""Clean the image cache files in cache of space crunch."""
|
||||
try:
|
||||
LOG.debug('Image cache cleaning in progress.')
|
||||
thres_size_perc_start = (
|
||||
self.configuration.thres_avl_size_perc_start)
|
||||
thres_size_perc_stop = self.configuration.thres_avl_size_perc_stop
|
||||
for share in getattr(self, '_mounted_shares', []):
|
||||
try:
|
||||
total_size, total_avl = self._get_capacity_info(share)
|
||||
avl_percent = int((float(total_avl) / total_size) * 100)
|
||||
if avl_percent <= thres_size_perc_start:
|
||||
LOG.info('Cleaning cache for share %s.', share)
|
||||
eligible_files = self._find_old_cache_files(share)
|
||||
threshold_size = int(
|
||||
(thres_size_perc_stop * total_size) / 100)
|
||||
bytes_to_free = int(threshold_size - total_avl)
|
||||
LOG.debug('Files to be queued for deletion %s',
|
||||
eligible_files)
|
||||
self._delete_files_till_bytes_free(
|
||||
eligible_files, share, bytes_to_free)
|
||||
else:
|
||||
continue
|
||||
except Exception as e:
|
||||
LOG.warning('Exception during cache cleaning'
|
||||
' %(share)s. Message - %(ex)s',
|
||||
{'share': share, 'ex': e})
|
||||
LOG.debug('Image cache cleaning in progress.')
|
||||
thres_size_perc_start = (
|
||||
self.configuration.thres_avl_size_perc_start)
|
||||
thres_size_perc_stop = self.configuration.thres_avl_size_perc_stop
|
||||
for share in self._mounted_shares:
|
||||
try:
|
||||
total_size, total_avl = self._get_capacity_info(share)
|
||||
avl_percent = int((float(total_avl) / total_size) * 100)
|
||||
if avl_percent <= thres_size_perc_start:
|
||||
LOG.info('Cleaning cache for share %s.', share)
|
||||
eligible_files = self._find_old_cache_files(share)
|
||||
threshold_size = int(
|
||||
(thres_size_perc_stop * total_size) / 100)
|
||||
bytes_to_free = int(threshold_size - total_avl)
|
||||
LOG.debug('Files to be queued for deletion %s',
|
||||
eligible_files)
|
||||
self._delete_files_till_bytes_free(
|
||||
eligible_files, share, bytes_to_free)
|
||||
else:
|
||||
continue
|
||||
finally:
|
||||
LOG.debug('Image cache cleaning done.')
|
||||
self.cleaning = False
|
||||
except Exception as e:
|
||||
LOG.warning('Exception during cache cleaning'
|
||||
' %(share)s. Message - %(ex)s',
|
||||
{'share': share, 'ex': e})
|
||||
|
||||
def _shortlist_del_eligible_files(self, share, old_files):
|
||||
"""Prepares list of eligible files to be deleted from cache."""
|
||||
|
@ -337,7 +337,6 @@ class NetAppCmodeNfsDriver(nfs_base.NetAppNfsDriver,
|
||||
# Used for service state report
|
||||
data['replication_enabled'] = self.replication_enabled
|
||||
|
||||
self._spawn_clean_cache_job()
|
||||
self._stats = data
|
||||
|
||||
def _get_pool_stats(self, filter_function=None, goodness_function=None):
|
||||
|
@ -102,6 +102,11 @@ netapp_cluster_opts = [
|
||||
'provisioning of block storage volumes should occur.')), ]
|
||||
|
||||
netapp_img_cache_opts = [
|
||||
cfg.IntOpt('netapp_nfs_image_cache_cleanup_interval',
|
||||
default=600,
|
||||
min=60,
|
||||
help=('Sets time in seconds between NFS image cache '
|
||||
'cleanup tasks.')),
|
||||
cfg.IntOpt('thres_avl_size_perc_start',
|
||||
default=20,
|
||||
help=('If the percentage of available space for an NFS share '
|
||||
|
@ -0,0 +1,6 @@
|
||||
---
|
||||
fixes:
|
||||
- |
|
||||
`Bug #2008017 <https://bugs.launchpad.net/cinder/+bug/2008017>`_: Fixed
|
||||
NetApp NFS driver to never spawn a native thread avoid thread starvation
|
||||
and other related issues.
|
Loading…
x
Reference in New Issue
Block a user