Support multiple processes on Cinder Backup

Cinder Backup has always been run as a single process, and this works
fine if the number of backups is small, but there are deployments where
a big number of backups are run simultaneously, and in such cases we
will find a bottleneck on CPU intensive operations, like compression and
SHA calculations, since we are using a single process and therefore a
single CPU core.

This patch adds support to define the number of processes we want to run
on the backup service using a new "backup_processes" configuration
option.

When running multiple processes they will all be running as children of
a parent process, but when running a single process it will run on its
own.

To preven race conditions and avoid the backup service from becoming
available before it should only the first process will do the initial
cleanup and all the others will wait on a lock until it is released by
this first process.

Change-Id: Ib43095024754a6219eb51cc0663913fac10bb642
This commit is contained in:
Gorka Eguileor 2018-02-05 17:43:39 +01:00
parent 9d894b5232
commit 373b524041
7 changed files with 111 additions and 8 deletions

View File

@ -111,6 +111,7 @@ class BackupManager(manager.ThreadPoolManager):
super(BackupManager, self).__init__(*args, **kwargs)
self.is_initialized = False
self._set_tpool_size(CONF.backup_native_threads_pool_size)
self._process_number = kwargs.get('process_number', 1)
@property
def driver_name(self):
@ -175,7 +176,17 @@ class BackupManager(manager.ThreadPoolManager):
self.backup_rpcapi = backup_rpcapi.BackupAPI()
self.volume_rpcapi = volume_rpcapi.VolumeAPI()
@utils.synchronized('backup-pgid-%s' % os.getpgrp(),
external=True, delay=0.1)
def _cleanup_incomplete_backup_operations(self, ctxt):
# Only the first launched process should do the cleanup, the others
# have waited on the lock for the first one to finish the cleanup and
# can now continue with the start process.
if self._process_number != 1:
LOG.debug("Process #%s (pgid=%s) skips cleanup.",
self._process_number, os.getpgrp())
return
LOG.info("Cleaning up incomplete backup operations.")
# TODO(smulcahy) implement full resume of backup and restore

View File

@ -27,6 +27,7 @@ import sys
import eventlet
eventlet.monkey_patch()
from oslo_concurrency import processutils
from oslo_config import cfg
from oslo_log import log as logging
from oslo_privsep import priv_context
@ -39,6 +40,7 @@ i18n.enable_lazy()
# Need to register global_opts
from cinder.common import config # noqa
from cinder.db import api as session
from cinder import objects
from cinder import service
from cinder import utils
@ -47,6 +49,15 @@ from cinder import version
CONF = cfg.CONF
backup_workers_opt = cfg.IntOpt(
'backup_workers',
default=1, min=1, max=processutils.get_worker_count(),
help='Number of backup processes to launch. Improves performance with '
'concurrent backups.')
CONF.register_opt(backup_workers_opt)
LOG = None
# NOTE(mriedem): The default backup driver uses swift and performs read/write
# operations in a thread. swiftclient will log requests and responses at DEBUG
# level, which can cause a thread switch and break the backup operation. So we
@ -54,6 +65,22 @@ CONF = cfg.CONF
_EXTRA_DEFAULT_LOG_LEVELS = ['swiftclient=WARN']
def _launch_backup_process(launcher, num_process):
try:
server = service.Service.create(binary='cinder-backup',
coordination=True,
process_number=num_process + 1)
except Exception:
LOG.exception('Backup service %s failed to start.', CONF.host)
sys.exit(1)
else:
# Dispose of the whole DB connection pool here before
# starting another process. Otherwise we run into cases where
# child processes share DB connections which results in errors.
session.dispose_engine()
launcher.launch_service(server)
def main():
objects.register_all()
gmr_opts.set_defaults(CONF)
@ -67,7 +94,21 @@ def main():
priv_context.init(root_helper=shlex.split(utils.get_root_helper()))
utils.monkey_patch()
gmr.TextGuruMeditation.setup_autorun(version, conf=CONF)
server = service.Service.create(binary='cinder-backup',
coordination=True)
service.serve(server)
service.wait()
global LOG
LOG = logging.getLogger(__name__)
if CONF.backup_workers > 1:
LOG.info('Backup running with %s processes.', CONF.backup_workers)
launcher = service.get_launcher()
for i in range(CONF.backup_workers):
_launch_backup_process(launcher, i)
launcher.wait()
else:
LOG.info('Backup running in single process mode.')
server = service.Service.create(binary='cinder-backup',
coordination=True,
process_number=1)
service.serve(server)
service.wait()

View File

@ -42,6 +42,7 @@ from cinder.backup.drivers import posix as cinder_backup_drivers_posix
from cinder.backup.drivers import swift as cinder_backup_drivers_swift
from cinder.backup.drivers import tsm as cinder_backup_drivers_tsm
from cinder.backup import manager as cinder_backup_manager
from cinder.cmd import backup as cinder_cmd_backup
from cinder.cmd import volume as cinder_cmd_volume
from cinder.common import config as cinder_common_config
import cinder.compute
@ -221,6 +222,7 @@ def list_opts():
cinder_backup_drivers_swift.swiftbackup_service_opts,
cinder_backup_drivers_tsm.tsm_opts,
cinder_backup_manager.backup_manager_opts,
[cinder_cmd_backup.backup_workers_opt],
[cinder_cmd_volume.cluster_opt],
cinder_common_config.core_opts,
cinder_common_config.global_opts,

View File

@ -366,7 +366,7 @@ class Service(service.Service):
def create(cls, host=None, binary=None, topic=None, manager=None,
report_interval=None, periodic_interval=None,
periodic_fuzzy_delay=None, service_name=None,
coordination=False, cluster=None):
coordination=False, cluster=None, **kwargs):
"""Instantiates class and passes back application object.
:param host: defaults to CONF.host
@ -400,7 +400,7 @@ class Service(service.Service):
periodic_fuzzy_delay=periodic_fuzzy_delay,
service_name=service_name,
coordination=coordination,
cluster=cluster)
cluster=cluster, **kwargs)
return service_obj

View File

@ -379,6 +379,21 @@ class BackupTestCase(BaseBackupTest):
self.assertEqual(len(fake_backup_list), mock_backup_cleanup.call_count)
self.assertEqual(len(fake_backup_list), mock_temp_cleanup.call_count)
@mock.patch('cinder.objects.BackupList')
@mock.patch.object(manager.BackupManager, '_cleanup_one_backup')
@mock.patch.object(manager.BackupManager,
'_cleanup_temp_volumes_snapshots_for_one_backup')
def test_cleanup_non_primary_process(self, temp_cleanup_mock,
backup_cleanup_mock, backup_ovo_mock):
"""Test cleanup doesn't run on non primary processes."""
self.backup_mgr._process_number = 2
self.backup_mgr._cleanup_incomplete_backup_operations(self.ctxt)
backup_ovo_mock.get_all_by_host.assert_not_called()
backup_cleanup_mock.assert_not_called()
temp_cleanup_mock.assert_not_called()
def test_cleanup_one_backing_up_volume(self):
"""Test cleanup_one_volume for volume status 'backing-up'."""

View File

@ -93,13 +93,14 @@ class TestCinderBackupCmd(test.TestCase):
super(TestCinderBackupCmd, self).setUp()
sys.argv = ['cinder-backup']
@mock.patch('cinder.cmd.backup._launch_backup_process')
@mock.patch('cinder.service.wait')
@mock.patch('cinder.service.serve')
@mock.patch('cinder.service.Service.create')
@mock.patch('cinder.utils.monkey_patch')
@mock.patch('oslo_log.log.setup')
def test_main(self, log_setup, monkey_patch, service_create, service_serve,
service_wait):
service_wait, launch_mock):
server = service_create.return_value
cinder_backup.main()
@ -109,9 +110,35 @@ class TestCinderBackupCmd(test.TestCase):
log_setup.assert_called_once_with(CONF, "cinder")
monkey_patch.assert_called_once_with()
service_create.assert_called_once_with(binary='cinder-backup',
coordination=True)
coordination=True,
process_number=1)
service_serve.assert_called_once_with(server)
service_wait.assert_called_once_with()
launch_mock.assert_not_called()
@mock.patch('cinder.service.get_launcher')
@mock.patch('cinder.service.Service.create')
@mock.patch('cinder.utils.monkey_patch')
@mock.patch('oslo_log.log.setup')
def test_main_multiprocess(self, log_setup, monkey_patch, service_create,
get_launcher):
CONF.set_override('backup_workers', 2)
cinder_backup.main()
self.assertEqual('cinder', CONF.project)
self.assertEqual(CONF.version, version.version_string())
c1 = mock.call(binary=constants.BACKUP_BINARY,
coordination=True,
process_number=1)
c2 = mock.call(binary=constants.BACKUP_BINARY,
coordination=True,
process_number=2)
service_create.assert_has_calls([c1, c2])
launcher = get_launcher.return_value
self.assertEqual(2, launcher.launch_service.call_count)
launcher.wait.assert_called_once_with()
class TestCinderSchedulerCmd(test.TestCase):

View File

@ -0,0 +1,7 @@
---
features:
- |
Cinder backup now supports running multiple processes to make the most of
the available CPU cores. Performance gains will be significant when
running multiple concurrent backups/restores with compression. The number
of processes is set with `backup_workers` configuration option.