Detect shared storage; handle base cleanup better.

If base image storage is shared, we need to take remote
instances into account when we clean up. This patch "learns"
which storage is shared, and then determines which base images
are in use anywhere on the set of compute nodes sharing that
base storage.

This is complicated because shared instance storage doesn't have
to be per-cluster. It could, for example, be per-rack. We need to
handle that properly.

This should resolve bug 1078594.

Change-Id: I36d0d6e965b114bb68c8f7b7fd43f8e96b2dd8f5
Author: Michael Still
Date: 2012-11-14 18:37:04 +11:00
parent e984c20f16
commit c2de33a0a2
6 changed files with 114 additions and 33 deletions

nova/compute/manager.py

@@ -73,6 +73,7 @@ from nova import quota
 from nova.scheduler import rpcapi as scheduler_rpcapi
 from nova import utils
 from nova.virt import driver
+from nova.virt import storage_users
 from nova.virt import virtapi
 from nova import volume
@@ -3247,4 +3248,19 @@ class ComputeManager(manager.SchedulerDependentManager):
             return
 
         all_instances = self.db.instance_get_all(context)
-        self.driver.manage_image_cache(context, all_instances)
+
+        # Determine what other nodes use this storage
+        storage_users.register_storage_use(CONF.instances_path, CONF.host)
+        nodes = storage_users.get_storage_users(CONF.instances_path)
+
+        # Filter all_instances to only include those nodes which share this
+        # storage path.
+        # TODO(mikal): this should be further refactored so that the cache
+        # cleanup code doesn't know what those instances are, just a remote
+        # count, and then this logic should be pushed up the stack.
+        filtered_instances = []
+        for instance in all_instances:
+            if instance['host'] in nodes:
+                filtered_instances.append(instance)
+
+        self.driver.manage_image_cache(context, filtered_instances)
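
To make the filtering concrete, here is a toy run of the same membership
test (the hostnames and minimal instance dicts are invented for the
example; only the instance['host'] in nodes check comes from the patch):

# Hosts registered against this instances_path, per get_storage_users().
nodes = ['compute-01', 'compute-02']

# Invented minimal instance records.
all_instances = [{'uuid': 'aaa', 'host': 'compute-01'},
                 {'uuid': 'bbb', 'host': 'compute-99'}]

filtered_instances = [i for i in all_instances if i['host'] in nodes]
print(filtered_instances)  # only 'aaa': instances on hosts that don't share
                           # this storage can't be using our base images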

nova/tests/test_imagecache.py

@@ -627,22 +627,22 @@ class ImageCacheManagerTestCase(test.TestCase):
         self.assertEquals(image_cache_manager.corrupt_base_files, [])
 
     def test_handle_base_image_used_remotely(self):
         self.stubs.Set(virtutils, 'chown', lambda x, y: None)
         img = '123'
 
         with self._make_base_file() as fname:
             os.utime(fname, (-1, time.time() - 3601))
 
             image_cache_manager = imagecache.ImageCacheManager()
             image_cache_manager.unexplained_images = [fname]
             image_cache_manager.used_images = {'123': (0, 1, ['banana-42'])}
-            image_cache_manager._handle_base_image(img, None)
+            image_cache_manager._handle_base_image(img, fname)
 
             self.assertEquals(image_cache_manager.unexplained_images, [])
             self.assertEquals(image_cache_manager.removable_base_files, [])
             self.assertEquals(image_cache_manager.corrupt_base_files, [])
 
     def test_handle_base_image_absent(self):
         """Ensure we warn for use of a missing base image."""
         img = '123'
         with self._intercept_log_messages() as stream:
@@ -942,7 +942,10 @@ class ImageCacheManagerTestCase(test.TestCase):
                     'vm_state': '',
                     'task_state': ''}]
 
-        self.stubs.Set(db, 'instance_get_all', fake_get_all)
-        compute = importutils.import_object(CONF.compute_manager)
-        compute._run_image_cache_manager_pass(None)
-        self.assertTrue(was['called'])
+        with utils.tempdir() as tmpdir:
+            self.flags(instances_path=tmpdir)
+            self.stubs.Set(db, 'instance_get_all', fake_get_all)
+            compute = importutils.import_object(CONF.compute_manager)
+            compute._run_image_cache_manager_pass(None)
+            self.assertTrue(was['called'])

nova/virt/driver.py

@@ -708,6 +708,7 @@ class ComputeDriver(object):
         related to other calls into the driver. The prime example is to clean
         the cache and remove images which are no longer of interest.
         """
+        pass
 
     def add_to_aggregate(self, context, aggregate, host, **kwargs):
         """Add a compute host to an aggregate."""

nova/virt/libvirt/driver.py

@@ -2388,7 +2388,7 @@ class LibvirtDriver(driver.ComputeDriver):
             raise exception.InvalidCPUInfo(reason=m % locals())
 
     def _create_shared_storage_test_file(self):
-        """Makes tmpfile under CONF.instance_path."""
+        """Makes tmpfile under CONF.instances_path."""
         dirpath = CONF.instances_path
         fd, tmp_file = tempfile.mkstemp(dir=dirpath)
         LOG.debug(_("Creating tmpfile %s to notify to other "

nova/virt/imagecache.py

@@ -258,8 +258,8 @@ class ImageCacheManager(object):
             current_checksum = utils.hash_file(f)
 
             if current_checksum != stored_checksum:
-                LOG.error(_('%(id)s (%(base_file)s): image verification '
-                            'failed'),
+                LOG.error(_('image %(id)s at (%(base_file)s): image '
+                            'verification failed'),
                           {'id': img_id,
                            'base_file': base_file})
                 return False
@@ -268,8 +268,8 @@ class ImageCacheManager(object):
                 return True
         else:
-            LOG.info(_('%(id)s (%(base_file)s): image verification '
-                       'skipped, no hash stored'),
+            LOG.info(_('image %(id)s at (%(base_file)s): image '
+                       'verification skipped, no hash stored'),
                      {'id': img_id,
                       'base_file': base_file})
@@ -325,7 +325,7 @@ class ImageCacheManager(object):
         image_bad = False
         image_in_use = False
 
-        LOG.info(_('%(id)s (%(base_file)s): checking'),
+        LOG.info(_('image %(id)s at (%(base_file)s): checking'),
                  {'id': img_id,
                   'base_file': base_file})
@@ -346,44 +346,42 @@ class ImageCacheManager(object):
         instances = []
         if img_id in self.used_images:
             local, remote, instances = self.used_images[img_id]
-            if local > 0:
-                LOG.debug(_('%(id)s (%(base_file)s): '
-                            'in use: on this node %(local)d local, '
-                            '%(remote)d on other nodes'),
-                          {'id': img_id,
-                           'base_file': base_file,
-                           'local': local,
-                           'remote': remote})
-                image_in_use = True
+            if local > 0 or remote > 0:
+                image_in_use = True
+                LOG.info(_('image %(id)s at (%(base_file)s): '
+                           'in use: on this node %(local)d local, '
+                           '%(remote)d on other nodes sharing this instance '
+                           'storage'),
+                         {'id': img_id,
+                          'base_file': base_file,
+                          'local': local,
+                          'remote': remote})
 
                 self.active_base_files.append(base_file)
 
                 if not base_file:
-                    LOG.warning(_('%(id)s (%(base_file)s): warning -- an '
-                                  'absent base file is in use! instances: '
-                                  '%(instance_list)s'),
+                    LOG.warning(_('image %(id)s at (%(base_file)s): warning '
+                                  '-- an absent base file is in use! '
+                                  'instances: %(instance_list)s'),
                                 {'id': img_id,
                                  'base_file': base_file,
                                  'instance_list': ' '.join(instances)})
-            else:
-                LOG.debug(_('%(id)s (%(base_file)s): in use on (%(remote)d on '
-                            'other nodes)'),
-                          {'id': img_id,
-                           'base_file': base_file,
-                           'remote': remote})
 
         if image_bad:
             self.corrupt_base_files.append(base_file)
 
         if base_file:
             if not image_in_use:
-                LOG.debug(_('%(id)s (%(base_file)s): image is not in use'),
+                LOG.debug(_('image %(id)s at (%(base_file)s): image is not in '
+                            'use'),
                           {'id': img_id,
                            'base_file': base_file})
                 self.removable_base_files.append(base_file)
             else:
-                LOG.debug(_('%(id)s (%(base_file)s): image is in use'),
+                LOG.debug(_('image %(id)s at (%(base_file)s): image is in '
+                            'use'),
                           {'id': img_id,
                            'base_file': base_file})
 
                 if os.path.exists(base_file):
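
The behavioural core of this hunk is the new local > 0 or remote > 0 test:
an image now counts as in use if any node sharing the storage uses it, not
just this one. A toy check of that logic, reusing the (local, remote,
instances) tuple shape and values from test_handle_base_image_used_remotely
above:

# used_images maps image id -> (local count, remote count, instance names).
used_images = {'123': (0, 1, ['banana-42'])}

local, remote, instances = used_images['123']
image_in_use = local > 0 or remote > 0
print(image_in_use)  # True: no local users, but one remote user on shared
                     # storage is now enough to keep the base file around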

nova/virt/storage_users.py

@@ -0,0 +1,63 @@
+# vim: tabstop=4 shiftwidth=4 softtabstop=4
+
+# Copyright 2012 Michael Still and Canonical Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+
+import json
+import os
+import time
+
+from nova.openstack.common import lockutils
+
+
+TWENTY_FOUR_HOURS = 3600 * 24
+
+
+@lockutils.synchronized('storage-registry-lock', 'nova-', external=True)
+def register_storage_use(storage_path, hostname):
+    """Identify the id of this instance storage."""
+
+    # NOTE(mikal): this is required to determine if the instance storage is
+    # shared, which is something that the image cache manager needs to
+    # know. I can imagine other uses as well though.
+
+    d = {}
+    id_path = os.path.join(storage_path, 'compute_nodes')
+    if os.path.exists(id_path):
+        with open(id_path) as f:
+            d = json.loads(f.read())
+    d[hostname] = time.time()
+
+    with open(id_path, 'w') as f:
+        f.write(json.dumps(d))
+
+
+@lockutils.synchronized('storage-registry-lock', 'nova-', external=True)
+def get_storage_users(storage_path):
+    """Get a list of all the users of this storage path."""
+
+    d = {}
+    id_path = os.path.join(storage_path, 'compute_nodes')
+    if os.path.exists(id_path):
+        with open(id_path) as f:
+            d = json.loads(f.read())
+
+    recent_users = []
+    for node in d:
+        if time.time() - d[node] < TWENTY_FOUR_HOURS:
+            recent_users.append(node)
+
+    return recent_users
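
To see the registry mechanics end to end, here is a standalone sketch of
the round-trip the module implements. The file layout (a JSON dict of
hostname to last-seen timestamp at <storage_path>/compute_nodes) comes
from the code above; the hostnames, the stale timestamp, and the omission
of the external lock are assumptions of the sketch.

import json
import os
import tempfile
import time

storage_path = tempfile.mkdtemp()  # stands in for CONF.instances_path
id_path = os.path.join(storage_path, 'compute_nodes')

# What two register_storage_use() calls leave behind: one fresh
# registration and one (simulated) made more than 24 hours ago.
with open(id_path, 'w') as f:
    f.write(json.dumps({'compute-01': time.time(),
                        'compute-02': time.time() - (3600 * 25)}))

# What get_storage_users() then computes: only recently-seen hosts count,
# so a compute node that went away stops pinning base images after a day.
with open(id_path) as f:
    d = json.loads(f.read())
recent_users = [node for node in d if time.time() - d[node] < 3600 * 24]
print(recent_users)  # ['compute-01']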