From a915b75d001cedd5c555f2746f3511fe458c9258 Mon Sep 17 00:00:00 2001 From: Alexey Ovchinnikov Date: Wed, 7 Sep 2016 15:56:18 +0300 Subject: [PATCH] Fix concurrency issues in container driver Previously container driver failed in concurrent environments deleting shares and share servers. Fix it by adding shared external lock. Change-Id: I543fd1226b3d99e06c736f4d925991d0bda7ea6d Closes-Bug: 1613676 --- contrib/ci/post_test_hook.sh | 3 +- contrib/ci/pre_test_hook.sh | 2 +- manila/share/drivers/container/driver.py | 32 ++++++++++++---- .../share/drivers/container/storage_helper.py | 21 +++++++++-- .../share/drivers/container/test_driver.py | 37 ++++++++++++++++++- .../drivers/container/test_protocol_helper.py | 2 +- .../drivers/container/test_storage_helper.py | 25 +++++++++++++ ...dening-against-races-30c9f517a6392b9d.yaml | 4 ++ 8 files changed, 111 insertions(+), 15 deletions(-) create mode 100644 releasenotes/notes/container-driver-hardening-against-races-30c9f517a6392b9d.yaml diff --git a/contrib/ci/post_test_hook.sh b/contrib/ci/post_test_hook.sh index 2c00ba4507..5ec1a1684b 100755 --- a/contrib/ci/post_test_hook.sh +++ b/contrib/ci/post_test_hook.sh @@ -212,7 +212,8 @@ elif [[ "$DRIVER" == "dummy" ]]; then iniset $TEMPEST_CONFIG share create_networks_when_multitenancy_enabled False iniset $TEMPEST_CONFIG share multi_backend True elif [[ "$DRIVER" == "container" ]]; then - MANILA_TEMPEST_CONCURRENCY=1 + MANILA_TESTS="(^manila_tempest_tests.tests.api)(?=.*\[.*\bbackend\b.*\])" + MANILA_TEMPEST_CONCURRENCY=8 RUN_MANILA_CG_TESTS=False RUN_MANILA_MANAGE_TESTS=False iniset $TEMPEST_CONFIG share run_host_assisted_migration_tests False diff --git a/contrib/ci/pre_test_hook.sh b/contrib/ci/pre_test_hook.sh index 665a6c3983..1b4ab48817 100755 --- a/contrib/ci/pre_test_hook.sh +++ b/contrib/ci/pre_test_hook.sh @@ -157,7 +157,7 @@ elif [[ "$DRIVER" == "zfsonlinux" ]]; then save_configuration "MANILA_ZFSONLINUX_USE_SSH" "True" elif [[ "$DRIVER" == "container" ]]; then save_configuration "SHARE_DRIVER" "manila.share.drivers.container.driver.ContainerShareDriver" - save_configuration "SHARE_BACKING_FILE_SIZE" "32000M" + save_configuration "SHARE_BACKING_FILE_SIZE" "64000M" save_configuration "MANILA_DEFAULT_SHARE_TYPE_EXTRA_SPECS" "'snapshot_support=false'" fi diff --git a/manila/share/drivers/container/driver.py b/manila/share/drivers/container/driver.py index d328192bf2..e5adac20dd 100644 --- a/manila/share/drivers/container/driver.py +++ b/manila/share/drivers/container/driver.py @@ -132,6 +132,7 @@ class ContainerShareDriver(driver.ShareDriver, driver.ExecuteMixin): location = self._get_helper(share).create_share(server_id) return location + @utils.synchronized('container_driver_delete_share_lock', external=True) def delete_share(self, context, share, share_server=None): LOG.debug("Deleting share %(share)s on server '%(server)s'." % {"server": share_server["id"], @@ -143,10 +144,21 @@ class ContainerShareDriver(driver.ShareDriver, driver.ExecuteMixin): server_id, ["umount", "/shares/%s" % share.share_id] ) - self.container.execute( - server_id, - ["rm", "-fR", "/shares/%s" % share.share_id] - ) + # (aovchinnikov): bug 1621784 manifests itself here as well as in + # storage helper. There is a chance that we won't be able to remove + # this directory, despite the fact that it is not shared anymore and + # already contains nothing. In such case the driver should not fail + # share deletion, but issue a warning. + try: + self.container.execute( + server_id, + ["rm", "-fR", "/shares/%s" % share.share_id] + ) + except exception.ProcessExecutionError as e: + LOG.warning(_LW("Failed to remove /shares/%(share)s directory in " + "container %(cont)s."), {"share": share.share_id, + "cont": server_id}) + LOG.error(e) self.storage.remove_storage(share) LOG.debug("Deletion of share %s is completed!", share.share_id) @@ -243,6 +255,7 @@ class ContainerShareDriver(driver.ShareDriver, driver.ExecuteMixin): LOG.debug("Now container %(id)s should be accessible from network " "%(subnet)s by address %(ip)s." % msg_helper) + @utils.synchronized("container_driver_teardown_lock", external=True) def _teardown_server(self, *args, **kwargs): server_id = self._get_container_name(kwargs["server_details"]["id"]) self.container.stop_container(server_id) @@ -259,9 +272,14 @@ class ContainerShareDriver(driver.ShareDriver, driver.ExecuteMixin): continue elif container_id[0] == server_id: LOG.debug("Deleting veth %s.", veth) - self._execute("ovs-vsctl", "--", "del-port", - self.configuration.container_ovs_bridge_name, - veth, run_as_root=True) + try: + self._execute("ovs-vsctl", "--", "del-port", + self.configuration.container_ovs_bridge_name, + veth, run_as_root=True) + except exception.ProcessExecutionError as e: + LOG.warning(_LW("Failed to delete port %s: port " + "vanished."), veth) + LOG.error(e) def _get_veth_state(self): result = self._execute("brctl", "show", diff --git a/manila/share/drivers/container/storage_helper.py b/manila/share/drivers/container/storage_helper.py index a787514ca4..4dbbf11fca 100644 --- a/manila/share/drivers/container/storage_helper.py +++ b/manila/share/drivers/container/storage_helper.py @@ -17,9 +17,10 @@ import os import re from oslo_config import cfg +from oslo_log import log from manila import exception -from manila.i18n import _ +from manila.i18n import _, _LW from manila.share import driver @@ -34,6 +35,7 @@ lv_opts = [ ] CONF.register_opts(lv_opts) +LOG = log.getLogger(__name__) class LVMHelper(driver.ExecuteMixin): @@ -79,8 +81,21 @@ class LVMHelper(driver.ExecuteMixin): run_as_root=True) def remove_storage(self, share): - self._execute("lvremove", "-f", "--autobackup", "n", - self._get_lv_device(share), run_as_root=True) + to_remove = self._get_lv_device(share) + try: + self._execute("umount", to_remove, run_as_root=True) + except exception.ProcessExecutionError as e: + LOG.warning(_LW("Failed to umount helper directory %s."), + to_remove) + LOG.error(e) + # (aovchinnikov): bug 1621784 manifests itself in jamming logical + # volumes, so try removing once and issue warning until it is fixed. + try: + self._execute("lvremove", "-f", "--autobackup", "n", + to_remove, run_as_root=True) + except exception.ProcessExecutionError as e: + LOG.warning(_LW("Failed to remove logical volume %s.") % to_remove) + LOG.error(e) def extend_share(self, share, new_size, share_server=None): lv_device = self._get_lv_device(share) diff --git a/manila/tests/share/drivers/container/test_driver.py b/manila/tests/share/drivers/container/test_driver.py index dd008b228e..57877ff028 100644 --- a/manila/tests/share/drivers/container/test_driver.py +++ b/manila/tests/share/drivers/container/test_driver.py @@ -119,7 +119,7 @@ class ContainerShareDriverTestCase(test.TestCase): self._driver.create_share(self._context, self.share, {'id': 'fake'})) - def test_delete_share(self): + def test_delete_share_ok(self): helper = mock.Mock() self.mock_object(self._driver, "_get_helper", mock.Mock(return_value=helper)) @@ -133,6 +133,19 @@ class ContainerShareDriverTestCase(test.TestCase): ['rm', '-fR', '/shares/fakeshareid'] ) + def test_delete_share_rm_fails(self): + def fake_execute(*args): + if 'rm' in args[1]: + raise exception.ProcessExecutionError() + self.mock_object(driver.LOG, "warning") + self.mock_object(self._driver, "_get_helper") + self.mock_object(self._driver.container, "execute", fake_execute) + self.mock_object(self._driver.storage, 'remove_storage') + + self._driver.delete_share(self._context, self.share, {'id': 'fake'}) + + self.assertTrue(driver.LOG.warning.called) + def test_extend_share(self): share = cont_fakes.fake_share() actual_arguments = [] @@ -193,7 +206,7 @@ class ContainerShareDriverTestCase(test.TestCase): setattr(self._driver.configuration.local_conf, 'network_api_class', 'manila.share.drivers.container.driver.ContainerShareDriver') - driver.LOG.warning = mock.Mock() + self.mock_object(driver.LOG, "warning") self._driver.check_for_setup_error() @@ -239,6 +252,26 @@ class ContainerShareDriverTestCase(test.TestCase): self.assertEqual(expected_arguments.sort(), actual_arguments.sort()) + @ddt.data(['veth0000000'], ['veth0000000' * 2]) + def test__teardown_server_veth_disappeared_mysteriously(self, + list_of_veths): + def fake_ovs_execute(*args, **kwargs): + if len(args) == 3: + return list_of_veths + if len(args) == 4: + return ('fake:manila_b5afb5c1_6011_43c4_8a37_29820e6951a7', '') + if 'del-port' in args: + raise exception.ProcessExecutionError() + else: + return 0 + self.mock_object(driver.LOG, "warning") + self.mock_object(self._driver, "_execute", fake_ovs_execute) + + self._driver._teardown_server( + server_details={"id": "b5afb5c1-6011-43c4-8a37-29820e6951a7"}) + + self.assertTrue(driver.LOG.warning.called) + @ddt.data(['veth0000000'], ['veth0000000' * 2]) def test__teardown_server_check_continuation(self, list_of_veths): def fake_ovs_execute(*args, **kwargs): diff --git a/manila/tests/share/drivers/container/test_protocol_helper.py b/manila/tests/share/drivers/container/test_protocol_helper.py index 3bab5df0f9..af30587d4b 100644 --- a/manila/tests/share/drivers/container/test_protocol_helper.py +++ b/manila/tests/share/drivers/container/test_protocol_helper.py @@ -216,7 +216,7 @@ class DockerCIFSHelperTestCase(test.TestCase): self.DockerCIFSHelper._get_existing_users = mock.Mock() self.DockerCIFSHelper._get_existing_users.side_effect = TypeError self.DockerCIFSHelper._set_users = mock.Mock() - protocol_helper.LOG.warning = mock.Mock() + self.mock_object(protocol_helper.LOG, "warning") self.DockerCIFSHelper._deny_access("fake_share", "fake_server_id", "fake_user2", "rw") diff --git a/manila/tests/share/drivers/container/test_storage_helper.py b/manila/tests/share/drivers/container/test_storage_helper.py index 02521f57fe..5d6c674d76 100644 --- a/manila/tests/share/drivers/container/test_storage_helper.py +++ b/manila/tests/share/drivers/container/test_storage_helper.py @@ -85,6 +85,7 @@ class LVMHelperTestCase(test.TestCase): def test_remove_storage(self): actual_arguments = [] expected_arguments = [ + ('umount', '/dev/manila_docker_volumes/fakeshareid'), ('lvremove', '-f', '--autobackup', 'n', '/dev/manila_docker_volumes/fakeshareid') ] @@ -96,6 +97,30 @@ class LVMHelperTestCase(test.TestCase): self.assertEqual(expected_arguments, actual_arguments) + def test_remove_storage_umount_failed(self): + def fake_execute(*args, **kwargs): + if 'umount' in args: + raise exception.ProcessExecutionError() + + self.mock_object(storage_helper.LOG, "warning") + self.mock_object(self.LVMHelper, "_execute", fake_execute) + + self.LVMHelper.remove_storage(self.share) + + self.assertTrue(storage_helper.LOG.warning.called) + + def test_remove_storage_lvremove_failed(self): + def fake_execute(*args, **kwargs): + if 'lvremove' in args: + raise exception.ProcessExecutionError() + + self.mock_object(storage_helper.LOG, "warning") + self.mock_object(self.LVMHelper, "_execute", fake_execute) + + self.LVMHelper.remove_storage(self.share) + + self.assertTrue(storage_helper.LOG.warning.called) + def test_extend_share(self): actual_arguments = [] expected_arguments = [ diff --git a/releasenotes/notes/container-driver-hardening-against-races-30c9f517a6392b9d.yaml b/releasenotes/notes/container-driver-hardening-against-races-30c9f517a6392b9d.yaml new file mode 100644 index 0000000000..30a7781483 --- /dev/null +++ b/releasenotes/notes/container-driver-hardening-against-races-30c9f517a6392b9d.yaml @@ -0,0 +1,4 @@ +--- +fixes: + - Container driver. Fixed share and share server deletion concurrencies + by adding shared external lock.