Detect shared storage; handle base cleanup better.

If base image storage is shared, we need to take remote
instances into account when we clean up. This patch "learns"
which storage is shared, and then determines which base images
are in use anywhere on the set of compute nodes sharing that
base storage.

This is complicated because shared instance storage doesn't have
to be per-cluster. It could, for example, be per-rack. We need to
handle that properly.

This should resolve bug 1078594.

Change-Id: I36d0d6e965b114bb68c8f7b7fd43f8e96b2dd8f5
Author: Michael Still
Date: 2012-11-14 18:37:04 +11:00
parent e984c20f16
commit c2de33a0a2
6 changed files with 114 additions and 33 deletions

nova/compute/manager.py

@@ -73,6 +73,7 @@ from nova import quota
 from nova.scheduler import rpcapi as scheduler_rpcapi
 from nova import utils
 from nova.virt import driver
+from nova.virt import storage_users
 from nova.virt import virtapi
 from nova import volume
@@ -3247,4 +3248,19 @@ class ComputeManager(manager.SchedulerDependentManager):
             return
 
         all_instances = self.db.instance_get_all(context)
-        self.driver.manage_image_cache(context, all_instances)
+
+        # Determine what other nodes use this storage
+        storage_users.register_storage_use(CONF.instances_path, CONF.host)
+        nodes = storage_users.get_storage_users(CONF.instances_path)
+
+        # Filter all_instances to only include those nodes which share this
+        # storage path.
+        # TODO(mikal): this should be further refactored so that the cache
+        # cleanup code doesn't know what those instances are, just a remote
+        # count, and then this logic should be pushed up the stack.
+        filtered_instances = []
+        for instance in all_instances:
+            if instance['host'] in nodes:
+                filtered_instances.append(instance)
+
+        self.driver.manage_image_cache(context, filtered_instances)
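
To make the filtering concrete, here is a toy run of the same membership
test (the hostnames and minimal instance dicts are invented for the
example; only the instance['host'] in nodes check comes from the patch):

# Hosts registered against this instances_path, per get_storage_users().
nodes = ['compute-01', 'compute-02']

# Invented minimal instance records.
all_instances = [{'uuid': 'aaa', 'host': 'compute-01'},
                 {'uuid': 'bbb', 'host': 'compute-99'}]

filtered_instances = [i for i in all_instances if i['host'] in nodes]
print(filtered_instances)  # only 'aaa': instances on hosts that don't share
                           # this storage can't be using our base images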

nova/tests/test_imagecache.py

@@ -627,22 +627,22 @@ class ImageCacheManagerTestCase(test.TestCase):
         self.assertEquals(image_cache_manager.corrupt_base_files, [])
 
     def test_handle_base_image_used_remotely(self):
         self.stubs.Set(virtutils, 'chown', lambda x, y: None)
         img = '123'
 
         with self._make_base_file() as fname:
             os.utime(fname, (-1, time.time() - 3601))
 
             image_cache_manager = imagecache.ImageCacheManager()
             image_cache_manager.unexplained_images = [fname]
             image_cache_manager.used_images = {'123': (0, 1, ['banana-42'])}
-            image_cache_manager._handle_base_image(img, None)
+            image_cache_manager._handle_base_image(img, fname)
 
             self.assertEquals(image_cache_manager.unexplained_images, [])
             self.assertEquals(image_cache_manager.removable_base_files, [])
             self.assertEquals(image_cache_manager.corrupt_base_files, [])
 
     def test_handle_base_image_absent(self):
         """Ensure we warn for use of a missing base image."""
         img = '123'
         with self._intercept_log_messages() as stream:
@@ -942,7 +942,10 @@ class ImageCacheManagerTestCase(test.TestCase):
                     'vm_state': '',
                     'task_state': ''}]
 
-        self.stubs.Set(db, 'instance_get_all', fake_get_all)
-        compute = importutils.import_object(CONF.compute_manager)
-        compute._run_image_cache_manager_pass(None)
-        self.assertTrue(was['called'])
+        with utils.tempdir() as tmpdir:
+            self.flags(instances_path=tmpdir)
+            self.stubs.Set(db, 'instance_get_all', fake_get_all)
+            compute = importutils.import_object(CONF.compute_manager)
+            compute._run_image_cache_manager_pass(None)
+            self.assertTrue(was['called'])

nova/virt/driver.py

@@ -708,6 +708,7 @@ class ComputeDriver(object):
         related to other calls into the driver. The prime example is to clean
         the cache and remove images which are no longer of interest.
         """
+        pass
 
     def add_to_aggregate(self, context, aggregate, host, **kwargs):
         """Add a compute host to an aggregate."""

nova/virt/libvirt/driver.py

@@ -2388,7 +2388,7 @@ class LibvirtDriver(driver.ComputeDriver):
             raise exception.InvalidCPUInfo(reason=m % locals())
 
     def _create_shared_storage_test_file(self):
-        """Makes tmpfile under CONF.instance_path."""
+        """Makes tmpfile under CONF.instances_path."""
         dirpath = CONF.instances_path
         fd, tmp_file = tempfile.mkstemp(dir=dirpath)
         LOG.debug(_("Creating tmpfile %s to notify to other "

nova/virt/imagecache.py

@@ -258,8 +258,8 @@ class ImageCacheManager(object):
             current_checksum = utils.hash_file(f)
 
             if current_checksum != stored_checksum:
-                LOG.error(_('%(id)s (%(base_file)s): image verification '
-                            'failed'),
+                LOG.error(_('image %(id)s at (%(base_file)s): image '
+                            'verification failed'),
                           {'id': img_id,
                            'base_file': base_file})
                 return False
@@ -268,8 +268,8 @@ class ImageCacheManager(object):
                 return True
         else:
-            LOG.info(_('%(id)s (%(base_file)s): image verification '
-                       'skipped, no hash stored'),
+            LOG.info(_('image %(id)s at (%(base_file)s): image '
+                       'verification skipped, no hash stored'),
                      {'id': img_id,
                       'base_file': base_file})
@@ -325,7 +325,7 @@ class ImageCacheManager(object):
         image_bad = False
         image_in_use = False
 
-        LOG.info(_('%(id)s (%(base_file)s): checking'),
+        LOG.info(_('image %(id)s at (%(base_file)s): checking'),
                  {'id': img_id,
                   'base_file': base_file})
@@ -346,44 +346,42 @@ class ImageCacheManager(object):
         instances = []
         if img_id in self.used_images:
             local, remote, instances = self.used_images[img_id]
-            if local > 0:
-                LOG.debug(_('%(id)s (%(base_file)s): '
-                            'in use: on this node %(local)d local, '
-                            '%(remote)d on other nodes'),
-                          {'id': img_id,
-                           'base_file': base_file,
-                           'local': local,
-                           'remote': remote})
-                image_in_use = True
+            if local > 0 or remote > 0:
+                image_in_use = True
+                LOG.info(_('image %(id)s at (%(base_file)s): '
+                           'in use: on this node %(local)d local, '
+                           '%(remote)d on other nodes sharing this instance '
+                           'storage'),
+                         {'id': img_id,
+                          'base_file': base_file,
+                          'local': local,
+                          'remote': remote})
 
                 self.active_base_files.append(base_file)
 
                 if not base_file:
-                    LOG.warning(_('%(id)s (%(base_file)s): warning -- an '
-                                  'absent base file is in use! instances: '
-                                  '%(instance_list)s'),
+                    LOG.warning(_('image %(id)s at (%(base_file)s): warning '
+                                  '-- an absent base file is in use! '
+                                  'instances: %(instance_list)s'),
                                 {'id': img_id,
                                  'base_file': base_file,
                                  'instance_list': ' '.join(instances)})
-            else:
-                LOG.debug(_('%(id)s (%(base_file)s): in use on (%(remote)d on '
-                            'other nodes)'),
-                          {'id': img_id,
-                           'base_file': base_file,
-                           'remote': remote})
 
         if image_bad:
             self.corrupt_base_files.append(base_file)
 
         if base_file:
             if not image_in_use:
-                LOG.debug(_('%(id)s (%(base_file)s): image is not in use'),
+                LOG.debug(_('image %(id)s at (%(base_file)s): image is not in '
+                            'use'),
                           {'id': img_id,
                            'base_file': base_file})
                 self.removable_base_files.append(base_file)
             else:
-                LOG.debug(_('%(id)s (%(base_file)s): image is in use'),
+                LOG.debug(_('image %(id)s at (%(base_file)s): image is in '
+                            'use'),
                           {'id': img_id,
                            'base_file': base_file})
 
                 if os.path.exists(base_file):
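
The behavioural core of this hunk is the new local > 0 or remote > 0 test:
an image now counts as in use if any node sharing the storage uses it, not
just this one. A toy check of that logic, reusing the (local, remote,
instances) tuple shape and values from test_handle_base_image_used_remotely
above:

# used_images maps image id -> (local count, remote count, instance names).
used_images = {'123': (0, 1, ['banana-42'])}

local, remote, instances = used_images['123']
image_in_use = local > 0 or remote > 0
print(image_in_use)  # True: no local users, but one remote user on shared
                     # storage is now enough to keep the base file around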

nova/virt/storage_users.py

@@ -0,0 +1,63 @@
+# vim: tabstop=4 shiftwidth=4 softtabstop=4
+
+# Copyright 2012 Michael Still and Canonical Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+
+import json
+import os
+import time
+
+from nova.openstack.common import lockutils
+
+
+TWENTY_FOUR_HOURS = 3600 * 24
+
+
+@lockutils.synchronized('storage-registry-lock', 'nova-', external=True)
+def register_storage_use(storage_path, hostname):
+    """Identify the id of this instance storage."""
+
+    # NOTE(mikal): this is required to determine if the instance storage is
+    # shared, which is something that the image cache manager needs to
+    # know. I can imagine other uses as well though.
+
+    d = {}
+    id_path = os.path.join(storage_path, 'compute_nodes')
+    if os.path.exists(id_path):
+        with open(id_path) as f:
+            d = json.loads(f.read())
+    d[hostname] = time.time()
+
+    with open(id_path, 'w') as f:
+        f.write(json.dumps(d))
+
+
+@lockutils.synchronized('storage-registry-lock', 'nova-', external=True)
+def get_storage_users(storage_path):
+    """Get a list of all the users of this storage path."""
+
+    d = {}
+    id_path = os.path.join(storage_path, 'compute_nodes')
+    if os.path.exists(id_path):
+        with open(id_path) as f:
+            d = json.loads(f.read())
+
+    recent_users = []
+    for node in d:
+        if time.time() - d[node] < TWENTY_FOUR_HOURS:
+            recent_users.append(node)
+
+    return recent_users
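
To see the registry mechanics end to end, here is a standalone sketch of
the round-trip the module implements. The file layout (a JSON dict of
hostname to last-seen timestamp at <storage_path>/compute_nodes) comes
from the code above; the hostnames, the stale timestamp, and the omission
of the external lock are assumptions of the sketch.

import json
import os
import tempfile
import time

storage_path = tempfile.mkdtemp()  # stands in for CONF.instances_path
id_path = os.path.join(storage_path, 'compute_nodes')

# What two register_storage_use() calls leave behind: one fresh
# registration and one (simulated) made more than 24 hours ago.
with open(id_path, 'w') as f:
    f.write(json.dumps({'compute-01': time.time(),
                        'compute-02': time.time() - (3600 * 25)}))

# What get_storage_users() then computes: only recently-seen hosts count,
# so a compute node that went away stops pinning base images after a day.
with open(id_path) as f:
    d = json.loads(f.read())
recent_users = [node for node in d if time.time() - d[node] < 3600 * 24]
print(recent_users)  # ['compute-01']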