Support multiple processes on Cinder Backup
Cinder Backup has always been run as a single process, and this works fine if the number of backups is small, but there are deployments where a big number of backups are run simultaneously, and in such cases we will find a bottleneck on CPU intensive operations, like compression and SHA calculations, since we are using a single process and therefore a single CPU core. This patch adds support to define the number of processes we want to run on the backup service using a new "backup_processes" configuration option. When running multiple processes they will all be running as children of a parent process, but when running a single process it will run on its own. To preven race conditions and avoid the backup service from becoming available before it should only the first process will do the initial cleanup and all the others will wait on a lock until it is released by this first process. Change-Id: Ib43095024754a6219eb51cc0663913fac10bb642
This commit is contained in:
parent
9d894b5232
commit
373b524041
@ -111,6 +111,7 @@ class BackupManager(manager.ThreadPoolManager):
|
||||
super(BackupManager, self).__init__(*args, **kwargs)
|
||||
self.is_initialized = False
|
||||
self._set_tpool_size(CONF.backup_native_threads_pool_size)
|
||||
self._process_number = kwargs.get('process_number', 1)
|
||||
|
||||
@property
|
||||
def driver_name(self):
|
||||
@ -175,7 +176,17 @@ class BackupManager(manager.ThreadPoolManager):
|
||||
self.backup_rpcapi = backup_rpcapi.BackupAPI()
|
||||
self.volume_rpcapi = volume_rpcapi.VolumeAPI()
|
||||
|
||||
@utils.synchronized('backup-pgid-%s' % os.getpgrp(),
|
||||
external=True, delay=0.1)
|
||||
def _cleanup_incomplete_backup_operations(self, ctxt):
|
||||
# Only the first launched process should do the cleanup, the others
|
||||
# have waited on the lock for the first one to finish the cleanup and
|
||||
# can now continue with the start process.
|
||||
if self._process_number != 1:
|
||||
LOG.debug("Process #%s (pgid=%s) skips cleanup.",
|
||||
self._process_number, os.getpgrp())
|
||||
return
|
||||
|
||||
LOG.info("Cleaning up incomplete backup operations.")
|
||||
|
||||
# TODO(smulcahy) implement full resume of backup and restore
|
||||
|
@ -27,6 +27,7 @@ import sys
|
||||
import eventlet
|
||||
eventlet.monkey_patch()
|
||||
|
||||
from oslo_concurrency import processutils
|
||||
from oslo_config import cfg
|
||||
from oslo_log import log as logging
|
||||
from oslo_privsep import priv_context
|
||||
@ -39,6 +40,7 @@ i18n.enable_lazy()
|
||||
|
||||
# Need to register global_opts
|
||||
from cinder.common import config # noqa
|
||||
from cinder.db import api as session
|
||||
from cinder import objects
|
||||
from cinder import service
|
||||
from cinder import utils
|
||||
@ -47,6 +49,15 @@ from cinder import version
|
||||
|
||||
CONF = cfg.CONF
|
||||
|
||||
backup_workers_opt = cfg.IntOpt(
|
||||
'backup_workers',
|
||||
default=1, min=1, max=processutils.get_worker_count(),
|
||||
help='Number of backup processes to launch. Improves performance with '
|
||||
'concurrent backups.')
|
||||
CONF.register_opt(backup_workers_opt)
|
||||
|
||||
LOG = None
|
||||
|
||||
# NOTE(mriedem): The default backup driver uses swift and performs read/write
|
||||
# operations in a thread. swiftclient will log requests and responses at DEBUG
|
||||
# level, which can cause a thread switch and break the backup operation. So we
|
||||
@ -54,6 +65,22 @@ CONF = cfg.CONF
|
||||
_EXTRA_DEFAULT_LOG_LEVELS = ['swiftclient=WARN']
|
||||
|
||||
|
||||
def _launch_backup_process(launcher, num_process):
|
||||
try:
|
||||
server = service.Service.create(binary='cinder-backup',
|
||||
coordination=True,
|
||||
process_number=num_process + 1)
|
||||
except Exception:
|
||||
LOG.exception('Backup service %s failed to start.', CONF.host)
|
||||
sys.exit(1)
|
||||
else:
|
||||
# Dispose of the whole DB connection pool here before
|
||||
# starting another process. Otherwise we run into cases where
|
||||
# child processes share DB connections which results in errors.
|
||||
session.dispose_engine()
|
||||
launcher.launch_service(server)
|
||||
|
||||
|
||||
def main():
|
||||
objects.register_all()
|
||||
gmr_opts.set_defaults(CONF)
|
||||
@ -67,7 +94,21 @@ def main():
|
||||
priv_context.init(root_helper=shlex.split(utils.get_root_helper()))
|
||||
utils.monkey_patch()
|
||||
gmr.TextGuruMeditation.setup_autorun(version, conf=CONF)
|
||||
server = service.Service.create(binary='cinder-backup',
|
||||
coordination=True)
|
||||
service.serve(server)
|
||||
service.wait()
|
||||
global LOG
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
if CONF.backup_workers > 1:
|
||||
LOG.info('Backup running with %s processes.', CONF.backup_workers)
|
||||
launcher = service.get_launcher()
|
||||
|
||||
for i in range(CONF.backup_workers):
|
||||
_launch_backup_process(launcher, i)
|
||||
|
||||
launcher.wait()
|
||||
else:
|
||||
LOG.info('Backup running in single process mode.')
|
||||
server = service.Service.create(binary='cinder-backup',
|
||||
coordination=True,
|
||||
process_number=1)
|
||||
service.serve(server)
|
||||
service.wait()
|
||||
|
@ -42,6 +42,7 @@ from cinder.backup.drivers import posix as cinder_backup_drivers_posix
|
||||
from cinder.backup.drivers import swift as cinder_backup_drivers_swift
|
||||
from cinder.backup.drivers import tsm as cinder_backup_drivers_tsm
|
||||
from cinder.backup import manager as cinder_backup_manager
|
||||
from cinder.cmd import backup as cinder_cmd_backup
|
||||
from cinder.cmd import volume as cinder_cmd_volume
|
||||
from cinder.common import config as cinder_common_config
|
||||
import cinder.compute
|
||||
@ -221,6 +222,7 @@ def list_opts():
|
||||
cinder_backup_drivers_swift.swiftbackup_service_opts,
|
||||
cinder_backup_drivers_tsm.tsm_opts,
|
||||
cinder_backup_manager.backup_manager_opts,
|
||||
[cinder_cmd_backup.backup_workers_opt],
|
||||
[cinder_cmd_volume.cluster_opt],
|
||||
cinder_common_config.core_opts,
|
||||
cinder_common_config.global_opts,
|
||||
|
@ -366,7 +366,7 @@ class Service(service.Service):
|
||||
def create(cls, host=None, binary=None, topic=None, manager=None,
|
||||
report_interval=None, periodic_interval=None,
|
||||
periodic_fuzzy_delay=None, service_name=None,
|
||||
coordination=False, cluster=None):
|
||||
coordination=False, cluster=None, **kwargs):
|
||||
"""Instantiates class and passes back application object.
|
||||
|
||||
:param host: defaults to CONF.host
|
||||
@ -400,7 +400,7 @@ class Service(service.Service):
|
||||
periodic_fuzzy_delay=periodic_fuzzy_delay,
|
||||
service_name=service_name,
|
||||
coordination=coordination,
|
||||
cluster=cluster)
|
||||
cluster=cluster, **kwargs)
|
||||
|
||||
return service_obj
|
||||
|
||||
|
@ -379,6 +379,21 @@ class BackupTestCase(BaseBackupTest):
|
||||
self.assertEqual(len(fake_backup_list), mock_backup_cleanup.call_count)
|
||||
self.assertEqual(len(fake_backup_list), mock_temp_cleanup.call_count)
|
||||
|
||||
@mock.patch('cinder.objects.BackupList')
|
||||
@mock.patch.object(manager.BackupManager, '_cleanup_one_backup')
|
||||
@mock.patch.object(manager.BackupManager,
|
||||
'_cleanup_temp_volumes_snapshots_for_one_backup')
|
||||
def test_cleanup_non_primary_process(self, temp_cleanup_mock,
|
||||
backup_cleanup_mock, backup_ovo_mock):
|
||||
"""Test cleanup doesn't run on non primary processes."""
|
||||
self.backup_mgr._process_number = 2
|
||||
|
||||
self.backup_mgr._cleanup_incomplete_backup_operations(self.ctxt)
|
||||
|
||||
backup_ovo_mock.get_all_by_host.assert_not_called()
|
||||
backup_cleanup_mock.assert_not_called()
|
||||
temp_cleanup_mock.assert_not_called()
|
||||
|
||||
def test_cleanup_one_backing_up_volume(self):
|
||||
"""Test cleanup_one_volume for volume status 'backing-up'."""
|
||||
|
||||
|
@ -93,13 +93,14 @@ class TestCinderBackupCmd(test.TestCase):
|
||||
super(TestCinderBackupCmd, self).setUp()
|
||||
sys.argv = ['cinder-backup']
|
||||
|
||||
@mock.patch('cinder.cmd.backup._launch_backup_process')
|
||||
@mock.patch('cinder.service.wait')
|
||||
@mock.patch('cinder.service.serve')
|
||||
@mock.patch('cinder.service.Service.create')
|
||||
@mock.patch('cinder.utils.monkey_patch')
|
||||
@mock.patch('oslo_log.log.setup')
|
||||
def test_main(self, log_setup, monkey_patch, service_create, service_serve,
|
||||
service_wait):
|
||||
service_wait, launch_mock):
|
||||
server = service_create.return_value
|
||||
|
||||
cinder_backup.main()
|
||||
@ -109,9 +110,35 @@ class TestCinderBackupCmd(test.TestCase):
|
||||
log_setup.assert_called_once_with(CONF, "cinder")
|
||||
monkey_patch.assert_called_once_with()
|
||||
service_create.assert_called_once_with(binary='cinder-backup',
|
||||
coordination=True)
|
||||
coordination=True,
|
||||
process_number=1)
|
||||
service_serve.assert_called_once_with(server)
|
||||
service_wait.assert_called_once_with()
|
||||
launch_mock.assert_not_called()
|
||||
|
||||
@mock.patch('cinder.service.get_launcher')
|
||||
@mock.patch('cinder.service.Service.create')
|
||||
@mock.patch('cinder.utils.monkey_patch')
|
||||
@mock.patch('oslo_log.log.setup')
|
||||
def test_main_multiprocess(self, log_setup, monkey_patch, service_create,
|
||||
get_launcher):
|
||||
CONF.set_override('backup_workers', 2)
|
||||
cinder_backup.main()
|
||||
|
||||
self.assertEqual('cinder', CONF.project)
|
||||
self.assertEqual(CONF.version, version.version_string())
|
||||
|
||||
c1 = mock.call(binary=constants.BACKUP_BINARY,
|
||||
coordination=True,
|
||||
process_number=1)
|
||||
c2 = mock.call(binary=constants.BACKUP_BINARY,
|
||||
coordination=True,
|
||||
process_number=2)
|
||||
service_create.assert_has_calls([c1, c2])
|
||||
|
||||
launcher = get_launcher.return_value
|
||||
self.assertEqual(2, launcher.launch_service.call_count)
|
||||
launcher.wait.assert_called_once_with()
|
||||
|
||||
|
||||
class TestCinderSchedulerCmd(test.TestCase):
|
||||
|
@ -0,0 +1,7 @@
|
||||
---
|
||||
features:
|
||||
- |
|
||||
Cinder backup now supports running multiple processes to make the most of
|
||||
the available CPU cores. Performance gains will be significant when
|
||||
running multiple concurrent backups/restores with compression. The number
|
||||
of processes is set with `backup_workers` configuration option.
|
Loading…
Reference in New Issue
Block a user