libvirt: Race condition leads to instance in error
ImageCacheManager deletes base image while image backend is copying image to the instance path leading instance to go in the error state. Acquired lock before removing image from cache. If libvirt is copying image to the instance path, image cache manager won't be able to remove it until libvirt finishes copying image completely. Closes-Bug: 1256838 Closes-Bug: 1470437 Co-Authored-By: Michael Still <mikal@stillhq.com> Depends-On: I337ce28e2fc516c91bec61ca3639ebff0029ad49 Change-Id: I376cc951922c338669fdf3f83da83e0d3cea1532
This commit is contained in:
parent
0d8a3d90fb
commit
ec9d5e375e
@ -246,3 +246,6 @@ ploop: CommandFilter, ploop, root
|
||||
|
||||
# nova/virt/libvirt/utils.py: 'xend', 'status'
|
||||
xend: CommandFilter, xend, root
|
||||
|
||||
# nova/virt/libvirt/utils.py:
|
||||
touch: CommandFilter, touch, root
|
||||
|
@ -77,6 +77,10 @@ def chown(path, owner):
|
||||
pass
|
||||
|
||||
|
||||
def update_mtime(path):
|
||||
pass
|
||||
|
||||
|
||||
def extract_snapshot(disk_path, source_fmt, out_path, dest_fmt):
|
||||
files[out_path] = ''
|
||||
|
||||
|
@ -20,6 +20,7 @@ import os
|
||||
import time
|
||||
|
||||
import mock
|
||||
from oslo_concurrency import lockutils
|
||||
from oslo_concurrency import processutils
|
||||
from oslo_config import cfg
|
||||
from oslo_log import formatters
|
||||
@ -461,34 +462,32 @@ class ImageCacheManagerTestCase(test.NoDBTestCase):
|
||||
[fname])
|
||||
self.assertEqual(image_cache_manager.corrupt_base_files, [])
|
||||
|
||||
def test_handle_base_image_used(self):
|
||||
self.stubs.Set(libvirt_utils, 'chown', lambda x, y: None)
|
||||
@mock.patch.object(libvirt_utils, 'update_mtime')
|
||||
def test_handle_base_image_used(self, mock_mtime):
|
||||
img = '123'
|
||||
|
||||
with self._make_base_file() as fname:
|
||||
os.utime(fname, (-1, time.time() - 3601))
|
||||
|
||||
image_cache_manager = imagecache.ImageCacheManager()
|
||||
image_cache_manager.unexplained_images = [fname]
|
||||
image_cache_manager.used_images = {'123': (1, 0, ['banana-42'])}
|
||||
image_cache_manager._handle_base_image(img, fname)
|
||||
|
||||
mock_mtime.assert_called_once_with(fname)
|
||||
self.assertEqual(image_cache_manager.unexplained_images, [])
|
||||
self.assertEqual(image_cache_manager.removable_base_files, [])
|
||||
self.assertEqual(image_cache_manager.corrupt_base_files, [])
|
||||
|
||||
def test_handle_base_image_used_remotely(self):
|
||||
self.stubs.Set(libvirt_utils, 'chown', lambda x, y: None)
|
||||
@mock.patch.object(libvirt_utils, 'update_mtime')
|
||||
def test_handle_base_image_used_remotely(self, mock_mtime):
|
||||
img = '123'
|
||||
|
||||
with self._make_base_file() as fname:
|
||||
os.utime(fname, (-1, time.time() - 3601))
|
||||
|
||||
image_cache_manager = imagecache.ImageCacheManager()
|
||||
image_cache_manager.unexplained_images = [fname]
|
||||
image_cache_manager.used_images = {'123': (0, 1, ['banana-42'])}
|
||||
image_cache_manager._handle_base_image(img, fname)
|
||||
|
||||
mock_mtime.assert_called_once_with(fname)
|
||||
self.assertEqual(image_cache_manager.unexplained_images, [])
|
||||
self.assertEqual(image_cache_manager.removable_base_files, [])
|
||||
self.assertEqual(image_cache_manager.corrupt_base_files, [])
|
||||
@ -527,9 +526,9 @@ class ImageCacheManagerTestCase(test.NoDBTestCase):
|
||||
self.assertEqual(image_cache_manager.removable_base_files, [])
|
||||
self.assertEqual(image_cache_manager.corrupt_base_files, [])
|
||||
|
||||
def test_handle_base_image_checksum_fails(self):
|
||||
@mock.patch.object(libvirt_utils, 'update_mtime')
|
||||
def test_handle_base_image_checksum_fails(self, mock_mtime):
|
||||
self.flags(checksum_base_images=True, group='libvirt')
|
||||
self.stubs.Set(libvirt_utils, 'chown', lambda x, y: None)
|
||||
|
||||
img = '123'
|
||||
|
||||
@ -546,12 +545,15 @@ class ImageCacheManagerTestCase(test.NoDBTestCase):
|
||||
image_cache_manager.used_images = {'123': (1, 0, ['banana-42'])}
|
||||
image_cache_manager._handle_base_image(img, fname)
|
||||
|
||||
mock_mtime.assert_called_once_with(fname)
|
||||
self.assertEqual(image_cache_manager.unexplained_images, [])
|
||||
self.assertEqual(image_cache_manager.removable_base_files, [])
|
||||
self.assertEqual(image_cache_manager.corrupt_base_files,
|
||||
[fname])
|
||||
|
||||
def test_verify_base_images(self):
|
||||
@mock.patch.object(libvirt_utils, 'update_mtime')
|
||||
@mock.patch.object(lockutils, 'external_lock')
|
||||
def test_verify_base_images(self, mock_lock, mock_mtime):
|
||||
hashed_1 = '356a192b7913b04c54574d18c28d46e6395428ab'
|
||||
hashed_21 = '472b07b9fcf2c2451e8781e944bf5f77cd8457c8'
|
||||
hashed_22 = '12c6fc06c99a462375eeb3f43dfd832b08ca9e17'
|
||||
@ -607,11 +609,6 @@ class ImageCacheManagerTestCase(test.NoDBTestCase):
|
||||
|
||||
self.stub_out('os.path.exists', lambda x: exists(x))
|
||||
|
||||
self.stubs.Set(libvirt_utils, 'chown', lambda x, y: None)
|
||||
|
||||
# We need to stub utime as well
|
||||
self.stub_out('os.utime', lambda x, y: None)
|
||||
|
||||
# Fake up some instances in the instances directory
|
||||
orig_listdir = os.listdir
|
||||
|
||||
@ -857,13 +854,13 @@ class ImageCacheManagerTestCase(test.NoDBTestCase):
|
||||
expect_set = set(['swap_123', 'swap_456'])
|
||||
self.assertEqual(image_cache_manager.back_swap_images, expect_set)
|
||||
|
||||
@mock.patch.object(libvirt_utils, 'chown')
|
||||
@mock.patch.object(lockutils, 'external_lock')
|
||||
@mock.patch.object(libvirt_utils, 'update_mtime')
|
||||
@mock.patch('os.path.exists', return_value=True)
|
||||
@mock.patch('os.utime')
|
||||
@mock.patch('os.path.getmtime')
|
||||
@mock.patch('os.remove')
|
||||
def test_age_and_verify_swap_images(self, mock_remove, mock_getmtime,
|
||||
mock_utime, mock_exist, mock_chown):
|
||||
mock_exist, mock_mtime, mock_lock):
|
||||
image_cache_manager = imagecache.ImageCacheManager()
|
||||
expected_remove = set()
|
||||
expected_exist = set(['swap_128', 'swap_256'])
|
||||
@ -894,6 +891,20 @@ class ImageCacheManagerTestCase(test.NoDBTestCase):
|
||||
self.assertIn('swap_128', expected_exist)
|
||||
self.assertIn('swap_256', expected_remove)
|
||||
|
||||
@mock.patch.object(utils, 'synchronized')
|
||||
@mock.patch.object(imagecache.ImageCacheManager, '_get_age_of_file',
|
||||
return_value=(True, 100))
|
||||
def test_lock_acquired_on_removing_old_enough_files(self, mock_get_age,
|
||||
mock_synchronized):
|
||||
base_file = '/tmp_age_test'
|
||||
lock_path = os.path.join(CONF.instances_path, 'locks')
|
||||
lock_file = os.path.split(base_file)[-1]
|
||||
image_cache_manager = imagecache.ImageCacheManager()
|
||||
image_cache_manager._remove_old_enough_file(
|
||||
base_file, 60, remove_sig=False, remove_lock=False)
|
||||
mock_synchronized.assert_called_once_with(lock_file, external=True,
|
||||
lock_path=lock_path)
|
||||
|
||||
|
||||
class VerifyChecksumTestCase(test.NoDBTestCase):
|
||||
|
||||
|
@ -25,7 +25,7 @@ from nova.virt import images
|
||||
|
||||
class QemuTestCase(test.NoDBTestCase):
|
||||
def test_qemu_info_with_bad_path(self):
|
||||
self.assertRaises(exception.InvalidDiskInfo,
|
||||
self.assertRaises(exception.DiskNotFound,
|
||||
images.qemu_img_info,
|
||||
'/path/that/does/not/exist')
|
||||
|
||||
|
@ -54,8 +54,7 @@ def qemu_img_info(path, format=None):
|
||||
CONF.import_opt('images_type', 'nova.virt.libvirt.imagebackend',
|
||||
group='libvirt')
|
||||
if not os.path.exists(path) and CONF.libvirt.images_type != 'rbd':
|
||||
msg = (_("Path does not exist %(path)s") % {'path': path})
|
||||
raise exception.InvalidDiskInfo(reason=msg)
|
||||
raise exception.DiskNotFound(location=path)
|
||||
|
||||
try:
|
||||
cmd = ('env', 'LC_ALL=C', 'LANG=C', 'qemu-img', 'info', path)
|
||||
|
@ -531,10 +531,15 @@ class Raw(Image):
|
||||
else:
|
||||
if not os.path.exists(base):
|
||||
prepare_template(target=base, max_size=size, *args, **kwargs)
|
||||
|
||||
# NOTE(mikal): Update the mtime of the base file so the image
|
||||
# cache manager knows it is in use.
|
||||
libvirt_utils.update_mtime(base)
|
||||
self.verify_base_size(base, size)
|
||||
if not os.path.exists(self.path):
|
||||
with fileutils.remove_path_on_error(self.path):
|
||||
copy_raw_image(base, self.path, size)
|
||||
|
||||
self.correct_format()
|
||||
|
||||
def resize_image(self, size):
|
||||
@ -584,6 +589,10 @@ class Qcow2(Image):
|
||||
# Download the unmodified base image unless we already have a copy.
|
||||
if not os.path.exists(base):
|
||||
prepare_template(target=base, max_size=size, *args, **kwargs)
|
||||
|
||||
# NOTE(ankit): Update the mtime of the base file so the image
|
||||
# cache manager knows it is in use.
|
||||
libvirt_utils.update_mtime(base)
|
||||
self.verify_base_size(base, size)
|
||||
|
||||
legacy_backing_size = None
|
||||
|
@ -449,10 +449,18 @@ class ImageCacheManager(imagecache.ImageCacheManager):
|
||||
if not exists:
|
||||
return
|
||||
|
||||
if age < maxage:
|
||||
LOG.info(_LI('Base or swap file too young to remove: %s'),
|
||||
base_file)
|
||||
else:
|
||||
lock_file = os.path.split(base_file)[-1]
|
||||
|
||||
@utils.synchronized(lock_file, external=True,
|
||||
lock_path=self.lock_path)
|
||||
def _inner_remove_old_enough_file():
|
||||
# NOTE(mikal): recheck that the file is old enough, as a new
|
||||
# user of the file might have come along while we were waiting
|
||||
# for the lock
|
||||
exists, age = self._get_age_of_file(base_file)
|
||||
if not exists or age < maxage:
|
||||
return
|
||||
|
||||
LOG.info(_LI('Removing base or swap file: %s'), base_file)
|
||||
try:
|
||||
os.remove(base_file)
|
||||
@ -466,6 +474,11 @@ class ImageCacheManager(imagecache.ImageCacheManager):
|
||||
{'base_file': base_file,
|
||||
'error': e})
|
||||
|
||||
if age < maxage:
|
||||
LOG.info(_LI('Base or swap file too young to remove: %s'),
|
||||
base_file)
|
||||
else:
|
||||
_inner_remove_old_enough_file()
|
||||
if remove_lock:
|
||||
try:
|
||||
# NOTE(jichenjc) The lock file will be constructed first
|
||||
@ -473,7 +486,6 @@ class ImageCacheManager(imagecache.ImageCacheManager):
|
||||
# like nova-9e881789030568a317fad9daae82c5b1c65e0d4a
|
||||
# or nova-03d8e206-6500-4d91-b47d-ee74897f9b4e
|
||||
# according to the original file name
|
||||
lock_file = os.path.split(base_file)[-1]
|
||||
lockutils.remove_external_lock_file(lock_file,
|
||||
lock_file_prefix='nova-', lock_path=self.lock_path)
|
||||
except OSError as e:
|
||||
@ -562,8 +574,7 @@ class ImageCacheManager(imagecache.ImageCacheManager):
|
||||
{'id': img_id,
|
||||
'base_file': base_file})
|
||||
if os.path.exists(base_file):
|
||||
libvirt_utils.chown(base_file, os.getuid())
|
||||
os.utime(base_file, None)
|
||||
libvirt_utils.update_mtime(base_file)
|
||||
|
||||
def _age_and_verify_swap_images(self, context, base_dir):
|
||||
LOG.debug('Verify swap images')
|
||||
@ -571,8 +582,7 @@ class ImageCacheManager(imagecache.ImageCacheManager):
|
||||
for ent in self.back_swap_images:
|
||||
base_file = os.path.join(base_dir, ent)
|
||||
if ent in self.used_swap_images and os.path.exists(base_file):
|
||||
libvirt_utils.chown(base_file, os.getuid())
|
||||
os.utime(base_file, None)
|
||||
libvirt_utils.update_mtime(base_file)
|
||||
elif self.remove_unused_base_images:
|
||||
self._remove_swap_file(base_file)
|
||||
|
||||
|
@ -248,6 +248,14 @@ def chown(path, owner):
|
||||
execute('chown', owner, path, run_as_root=True)
|
||||
|
||||
|
||||
def update_mtime(path):
|
||||
"""Touch a file without being the owner.
|
||||
|
||||
:param path: File bump the mtime on
|
||||
"""
|
||||
execute('touch', '-c', path, run_as_root=True)
|
||||
|
||||
|
||||
def _id_map_to_config(id_map):
|
||||
return "%s:%s:%s" % (id_map.start, id_map.target, id_map.count)
|
||||
|
||||
|
@ -0,0 +1,13 @@
|
||||
---
|
||||
upgrade:
|
||||
- Upgrade the rootwrap configuration for the compute service,
|
||||
so that patches requiring new rootwrap configuration can be
|
||||
tested with grenade.
|
||||
fixes:
|
||||
- In a race condition if base image is deleted by ImageCacheManager
|
||||
while imagebackend is copying the image to instance path, then the
|
||||
instance goes in to error state. In this case when libvirt has
|
||||
changed the base file ownership to libvirt-qemu while imagebackend
|
||||
is copying the image, then we get permission denied error on updating
|
||||
the file access time using os.utime. Fixed this issue by updating the
|
||||
base file access time with root user privileges using 'touch' command.
|
Loading…
Reference in New Issue
Block a user