2013-09-20 01:00:54 +08:00
|
|
|
# Copyright (c) 2010-2012 OpenStack Foundation
|
2010-07-12 17:03:45 -05:00
|
|
|
#
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
#
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
#
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
|
|
|
# implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
2016-10-30 22:24:18 -07:00
|
|
|
import collections
|
2016-02-29 13:14:56 +00:00
|
|
|
import json
|
2010-07-12 17:03:45 -05:00
|
|
|
import unittest
|
|
|
|
import os
|
2012-12-17 06:39:25 -05:00
|
|
|
import mock
|
2010-07-12 17:03:45 -05:00
|
|
|
from gzip import GzipFile
|
|
|
|
from shutil import rmtree
|
2019-07-15 18:07:51 -07:00
|
|
|
import six
|
2015-07-07 22:46:37 +05:30
|
|
|
import six.moves.cPickle as pickle
|
2010-10-29 15:26:35 -07:00
|
|
|
import time
|
2010-11-16 11:06:39 -08:00
|
|
|
import tempfile
|
2013-07-20 13:44:11 -07:00
|
|
|
from contextlib import contextmanager, closing
|
2015-08-21 18:14:55 -07:00
|
|
|
from collections import defaultdict
|
2014-05-29 00:54:07 -07:00
|
|
|
from errno import ENOENT, ENOTEMPTY, ENOTDIR
|
2013-07-20 13:44:11 -07:00
|
|
|
|
2010-07-12 17:03:45 -05:00
|
|
|
from eventlet.green import subprocess
|
2018-02-26 21:23:55 +09:00
|
|
|
from eventlet import Timeout, sleep
|
2013-07-20 13:44:11 -07:00
|
|
|
|
2015-08-21 18:14:55 -07:00
|
|
|
from test.unit import (debug_logger, patch_policies, make_timestamp_iter,
|
Remove object replicator's lockup detector/mitigator.
Sometimes, an rsync process just won't die. You can send SIGKILL, but
it isn't very effective. This is sometimes seen due to attempted I/O
on a failing disk; with some disks, an rsync process won't die until
Linux finishes the current I/O operation (whether success or failure),
but the disk can't succeed and will retry forever instead of
failing. The net effect is an unkillable rsync process.
The replicator was dealing with this by sending SIGKILL to any rsync
that ran too long, then calling waitpid() in a loop[1] until the rsync
died so it could reap the child process. This worked pretty well
unless it met an unkillable rsync; in that case, one greenthread would
end up blocked for a very long time. Since the replicator's main loop
works by (a) gathering all replication jobs, (b) performing them in
parallel with some limited concurrency, then (c) waiting for all jobs
to complete, an unkillable rsync would block the entire replicator.
There was an attempt to address this by adding a lockup detector: if
the replicator failed to complete any replication cycle in N seconds
[2], all greenthreads except the main one would be terminated and the
replication cycle restarted. It works okay, but only handles total
failure. If you have 20 greenthreads working and 19 of them are
blocked on unkillable rsyncs, then as long as the 20th greenthread
manages to replicate at least one partition every N seconds, the
replicator will just keep limping along.
This commit removes the lockup detector. Instead, when a replicator
greenthread happens upon an rsync that doesn't die promptly after
receiving SIGKILL, the process handle is sent to a background
greenthread; that background greenthread simply waits for those rsync
processes to finally die and reaps them. This lets the replicator make
better progress in the presence of unkillable rsyncs.
[1] It's a call to subprocess.Popen.wait(); the looping and sleeping
happens in eventlet.
[2] The default is 1800 seconds = 30 minutes, but the value is
configurable.
Change-Id: If6dc7b003e18ab4e8a5ed687c965025ebd417dfa
2018-03-12 17:58:23 -07:00
|
|
|
mocked_http_conn, mock_check_drive, skip_if_no_xattrs)
|
2013-10-07 12:10:31 +00:00
|
|
|
from swift.common import utils
|
2015-08-25 11:24:49 +10:00
|
|
|
from swift.common.utils import (hash_path, mkdirs, normalize_timestamp,
|
|
|
|
storage_directory)
|
2010-07-12 17:03:45 -05:00
|
|
|
from swift.common import ring
|
2013-07-29 15:49:37 -04:00
|
|
|
from swift.obj import diskfile, replicator as object_replicator
|
2014-06-23 12:52:50 -07:00
|
|
|
from swift.common.storage_policy import StoragePolicy, POLICIES
|
2010-07-12 17:03:45 -05:00
|
|
|
|
2010-11-05 09:15:31 -07:00
|
|
|
|
Allow 1+ object-servers-per-disk deployment
Enabled by a new > 0 integer config value, "servers_per_port" in the
[DEFAULT] config section for object-server and/or replication server
configs. The setting's integer value determines how many different
object-server workers handle requests for any single unique local port
in the ring. In this mode, the parent swift-object-server process
continues to run as the original user (i.e. root if low-port binding
is required), binds to all ports as defined in the ring, and forks off
the specified number of workers per listen socket. The child, per-port
servers drop privileges and behave pretty much how object-server workers
always have, except that because the ring has unique ports per disk, the
object-servers will only be handling requests for a single disk. The
parent process detects dead servers and restarts them (with the correct
listen socket), starts missing servers when an updated ring file is
found with a device on the server with a new port, and kills extraneous
servers when their port is found to no longer be in the ring. The ring
files are stat'ed at most every "ring_check_interval" seconds, as
configured in the object-server config (same default of 15s).
Immediately stopping all swift-object-worker processes still works by
sending the parent a SIGTERM. Likewise, a SIGHUP to the parent process
still causes the parent process to close all listen sockets and exit,
allowing existing children to finish serving their existing requests.
The drop_privileges helper function now has an optional param to
suppress the setsid() call, which otherwise screws up the child workers'
process management.
The class method RingData.load() can be told to only load the ring
metadata (i.e. everything except replica2part2dev_id) with the optional
kwarg, header_only=True. This is used to keep the parent and all
forked off workers from unnecessarily having full copies of all storage
policy rings in memory.
A new helper class, swift.common.storage_policy.BindPortsCache,
provides a method to return a set of all device ports in all rings for
the server on which it is instantiated (identified by its set of IP
addresses). The BindPortsCache instance will track mtimes of ring
files, so they are not opened more frequently than necessary.
This patch includes enhancements to the probe tests and
object-replicator/object-reconstructor config plumbing to allow the
probe tests to work correctly both in the "normal" config (same IP but
unique ports for each SAIO "server") and a server-per-port setup where
each SAIO "server" must have a unique IP address and unique port per
disk within each "server". The main probe tests only work with 4
servers and 4 disks, but you can see the difference in the rings for the
EC probe tests where there are 2 disks per server for a total of 8
disks. Specifically, swift.common.ring.utils.is_local_device() will
ignore the ports when the "my_port" argument is None. Then,
object-replicator and object-reconstructor both set self.bind_port to
None if server_per_port is enabled. Bonus improvement for IPv6
addresses in is_local_device().
This PR for vagrant-swift-all-in-one will aid in testing this patch:
https://github.com/swiftstack/vagrant-swift-all-in-one/pull/16/
Also allow SAIO to answer is_local_device() better; common SAIO setups
have multiple "servers" all on the same host with different ports for
the different "servers" (which happen to match the IPs specified in the
rings for the devices on each of those "servers").
However, you can configure the SAIO to have different localhost IP
addresses (e.g. 127.0.0.1, 127.0.0.2, etc.) in the ring and in the
servers' config files' bind_ip setting.
This new whataremyips() implementation combined with a little plumbing
allows is_local_device() to accurately answer, even on an SAIO.
In the default case (an unspecified bind_ip defaults to '0.0.0.0') as
well as an explict "bind to everything" like '0.0.0.0' or '::',
whataremyips() behaves as it always has, returning all IP addresses for
the server.
Also updated probe tests to handle each "server" in the SAIO having a
unique IP address.
For some (noisy) benchmarks that show servers_per_port=X is at least as
good as the same number of "normal" workers:
https://gist.github.com/dbishop/c214f89ca708a6b1624a#file-summary-md
Benchmarks showing the benefits of I/O isolation with a small number of
slow disks:
https://gist.github.com/dbishop/fd0ab067babdecfb07ca#file-results-md
If you were wondering what the overhead of threads_per_disk looks like:
https://gist.github.com/dbishop/1d14755fedc86a161718#file-tabular_results-md
DocImpact
Change-Id: I2239a4000b41a7e7cc53465ce794af49d44796c6
2015-05-14 22:14:15 -07:00
|
|
|
def _ips(*args, **kwargs):
|
2010-11-05 09:15:31 -07:00
|
|
|
return ['127.0.0.0']
|
2010-07-12 17:03:45 -05:00
|
|
|
|
2010-11-05 09:15:31 -07:00
|
|
|
|
2010-10-29 15:26:35 -07:00
|
|
|
def mock_http_connect(status):
    """Return a connection factory whose connections always report *status*.

    The factory accepts the same positional signature as http_connect
    (host, port, device, partition, method, path, ...) and yields a
    FakeConn that doubles as its own response object.
    """

    class FakeConn(object):
        """Minimal in-memory stand-in for an HTTP connection/response."""

        def __init__(self, status, *args, **kwargs):
            self.status = status
            self.reason = 'Fake'
            # positional layout mirrors http_connect's call convention
            self.host, self.port = args[0], args[1]
            self.method, self.path = args[4], args[5]
            self.with_exc = False
            self.headers = kwargs.get('headers', {})

        def getresponse(self):
            # act as our own response unless a failure was requested
            if not self.with_exc:
                return self
            raise Exception('test')

        def getheader(self, header):
            return self.headers[header]

        def read(self, amt=None):
            # every response body is an empty pickled mapping
            return pickle.dumps({})

        def close(self):
            return

    def _connect(*args, **kwargs):
        return FakeConn(status, *args, **kwargs)

    return _connect
|
|
|
|
|
2020-04-03 10:44:25 -07:00
|
|
|
|
2010-10-29 15:26:35 -07:00
|
|
|
# Module-level sink for "Invalid: ..." messages appended by
# MockProcess.__init__ when a spawned command line is missing an
# expected argument.
process_errors = []
|
|
|
|
|
2010-11-05 09:15:31 -07:00
|
|
|
|
2010-07-12 17:03:45 -05:00
|
|
|
class MockProcess(object):
    """Stand-in for subprocess.Popen that validates rsync invocations.

    The generator-valued class attributes are primed externally (by the
    _mock_process context manager) with one entry per anticipated
    process: an exit code, a stdout payload, and the argument
    substrings expected on the command line.
    """
    ret_code = None      # generator of exit codes, consumed by wait()
    ret_log = None       # generator of stdout payloads, consumed by read()
    check_args = None    # generator of expected-argument tuples
    captured_log = None  # list collecting per-process capture dicts

    class Stream(object):
        """Fake stdout; hands back the next canned log payload."""

        def read(self):
            return next(MockProcess.ret_log)

    def __init__(self, *args, **kwargs):
        cmdline = args[0]
        expected = next(MockProcess.check_args)
        for targ in expected:
            # Allow more than 2 candidate targs
            # (e.g. a case that either node is fine when nodes shuffled)
            if isinstance(targ, tuple):
                if not any(candidate in cmdline for candidate in targ):
                    process_errors.append("Invalid: %s not in %s" % (targ,
                                                                     args))
            elif targ not in cmdline:
                process_errors.append("Invalid: %s not in %s" % (targ,
                                                                 args))
        self.captured_info = {
            'rsync_args': cmdline,
        }
        self.stdout = self.Stream()

    def wait(self):
        # the _mock_process context manager assures this class attribute is a
        # mutable list and takes care of resetting it
        rv = next(self.ret_code)
        if self.captured_log is not None:
            self.captured_info['ret_code'] = rv
            self.captured_log.append(self.captured_info)
        return rv
|
2010-07-12 17:03:45 -05:00
|
|
|
|
2010-11-05 09:15:31 -07:00
|
|
|
|
2010-07-12 17:03:45 -05:00
|
|
|
@contextmanager
def _mock_process(ret):
    """Temporarily install MockProcess as the replicator's Popen.

    *ret* is a sequence of (exit_code, log_output, expected_args)
    tuples, one per anticipated rsync invocation.  Yields the list
    that accumulates each MockProcess's captured info; restores the
    real Popen and clears the capture list on exit.
    """
    log_sink = []
    MockProcess.captured_log = log_sink
    saved_popen = subprocess.Popen
    MockProcess.ret_code = (item[0] for item in ret)
    # stdout payloads are bytes on py3, native str on py2
    MockProcess.ret_log = (item[1] if six.PY2 else item[1].encode('utf8')
                           for item in ret)
    MockProcess.check_args = (item[2] for item in ret)
    object_replicator.subprocess.Popen = MockProcess
    yield log_sink
    MockProcess.captured_log = None
    object_replicator.subprocess.Popen = saved_popen
|
|
|
|
|
2010-11-05 09:15:31 -07:00
|
|
|
|
2018-03-03 17:07:54 +00:00
|
|
|
class MockHungProcess(object):
    """Fake Popen emulating an rsync process that refuses to die promptly.

    The process only reports itself reaped after *polls_needed* calls
    to poll(); every method records a (name, state-at-call) tuple in
    self._calls so tests can audit the interaction sequence.
    """

    def __init__(self, polls_needed=0, *args, **kwargs):
        class _FakeStdout(object):
            def read(self):
                pass

        self.stdout = _FakeStdout()
        self._state = 'running'
        self._calls = []
        self._polls = 0
        self._polls_needed = polls_needed

    def wait(self, timeout=None):
        self._calls.append(('wait', self._state))
        if self._state == 'running':
            # Sleep so we trip the rsync timeout
            sleep(1)
            raise BaseException('You need to mock out some timeouts')
        if self._polls_needed:
            # still hung: a bounded wait times out, an unbounded one is a bug
            if timeout is None:
                raise BaseException("You're waiting indefinitely on something "
                                    "we've established is hung")
            raise subprocess.TimeoutExpired('some cmd', timeout)
        self._state = 'os-reaped'
        return 137

    def poll(self):
        self._calls.append(('poll', self._state))
        self._polls += 1
        if self._polls < self._polls_needed:
            return None
        self._state = 'os-reaped'
        return 137

    def terminate(self):
        self._calls.append(('terminate', self._state))
        if self._state == 'running':
            self._state = 'terminating'

    def kill(self):
        self._calls.append(('kill', self._state))
        self._state = 'killed'
|
|
|
|
|
|
|
|
|
2016-07-04 18:21:54 +02:00
|
|
|
def _create_test_rings(path, devs=None, next_part_power=None):
    """Write identical object.ring.gz and object-1.ring.gz files in *path*.

    Uses *devs* for the device list when supplied, otherwise a canned
    seven-device layout spanning several zones/regions (including IPv6
    and a distinct replication_ip).  Afterwards clears each policy's
    cached object ring so the next access reloads from disk.
    """
    replica2part2dev_id = [
        [0, 1, 2, 3, 4, 5, 6],
        [1, 2, 3, 0, 5, 6, 4],
        [2, 3, 0, 1, 6, 4, 5],
    ]
    ring_devs = devs or [
        {'id': 0, 'device': 'sda', 'zone': 0,
         'region': 1, 'ip': '127.0.0.0', 'port': 6200},
        {'id': 1, 'device': 'sda', 'zone': 1,
         'region': 2, 'ip': '127.0.0.1', 'port': 6200},
        {'id': 2, 'device': 'sda', 'zone': 2,
         'region': 3, 'ip': '127.0.0.2', 'port': 6200},
        {'id': 3, 'device': 'sda', 'zone': 4,
         'region': 2, 'ip': '127.0.0.3', 'port': 6200},
        {'id': 4, 'device': 'sda', 'zone': 5,
         'region': 1, 'ip': '127.0.0.4', 'port': 6200,
         'replication_ip': '127.0.1.4'},
        {'id': 5, 'device': 'sda', 'zone': 6,
         'region': 3, 'ip': 'fe80::202:b3ff:fe1e:8329', 'port': 6200},
        {'id': 6, 'device': 'sda', 'zone': 7, 'region': 1,
         'ip': '2001:0db8:85a3:0000:0000:8a2e:0370:7334', 'port': 6200},
    ]
    part_shift = 30
    # both rings (policy 0 and policy 1) get identical contents
    for ring_file in ('object.ring.gz', 'object-1.ring.gz'):
        testgz = os.path.join(path, ring_file)
        with closing(GzipFile(testgz, 'wb')) as f:
            pickle.dump(
                ring.RingData(replica2part2dev_id, ring_devs,
                              part_shift, next_part_power),
                f)
    for policy in POLICIES:
        policy.object_ring = None  # force reload
    return
|
2010-07-12 17:03:45 -05:00
|
|
|
|
|
|
|
|
2014-03-18 10:50:17 -07:00
|
|
|
@patch_policies([StoragePolicy(0, 'zero', False),
|
|
|
|
StoragePolicy(1, 'one', True)])
|
2010-07-12 17:03:45 -05:00
|
|
|
class TestObjectReplicator(unittest.TestCase):
|
|
|
|
|
|
|
|
def setUp(self):
|
Add checksum to object extended attributes
Currently, our integrity checking for objects is pretty weak when it
comes to object metadata. If the extended attributes on a .data or
.meta file get corrupted in such a way that we can still unpickle it,
we don't have anything that detects that.
This could be especially bad with encrypted etags; if the encrypted
etag (X-Object-Sysmeta-Crypto-Etag or whatever it is) gets some bits
flipped, then we'll cheerfully decrypt the cipherjunk into plainjunk,
then send it to the client. Net effect is that the client sees a GET
response with an ETag that doesn't match the MD5 of the object *and*
Swift has no way of detecting and quarantining this object.
Note that, with an unencrypted object, if the ETag metadatum gets
mangled, then the object will be quarantined by the object server or
auditor, whichever notices first.
As part of this commit, I also ripped out some mocking of
getxattr/setxattr in tests. It appears to be there to allow unit tests
to run on systems where /tmp doesn't support xattrs. However, since
the mock is keyed off of inode number and inode numbers get re-used,
there's lots of leakage between different test runs. On a real FS,
unlinking a file and then creating a new one of the same name will
also reset the xattrs; this isn't the case with the mock.
The mock was pretty old; Ubuntu 12.04 and up all support xattrs in
/tmp, and recent Red Hat / CentOS releases do too. The xattr mock was
added in 2011; maybe it was to support Ubuntu Lucid Lynx?
Bonus: now you can pause a test with the debugger, inspect its files
in /tmp, and actually see the xattrs along with the data.
Since this patch now uses a real filesystem for testing filesystem
operations, tests are skipped if the underlying filesystem does not
support setting xattrs (eg tmpfs or more than 4k of xattrs on ext4).
References to "/tmp" have been replaced with calls to
tempfile.gettempdir(). This will allow setting the TMPDIR envvar in
test setup and getting an XFS filesystem instead of ext4 or tmpfs.
THIS PATCH SIGNIFICANTLY CHANGES TESTING ENVIRONMENTS
With this patch, every test environment will require TMPDIR to be
using a filesystem that supports at least 4k of extended attributes.
Neither ext4 nor tempfs support this. XFS is recommended.
So why all the SkipTests? Why not simply raise an error? We still need
the tests to run on the base image for OpenStack's CI system. Since
we were previously mocking out xattr, there wasn't a problem, but we
also weren't actually testing anything. This patch adds functionality
to validate xattr data, so we need to drop the mock.
`test.unit.skip_if_no_xattrs()` is also imported into `test.functional`
so that functional tests can import it from the functional test
namespace.
The related OpenStack CI infrastructure changes are made in
https://review.openstack.org/#/c/394600/.
Co-Authored-By: John Dickinson <me@not.mn>
Change-Id: I98a37c0d451f4960b7a12f648e4405c6c6716808
2016-06-30 16:52:58 -07:00
|
|
|
skip_if_no_xattrs()
|
2018-11-01 17:49:35 +00:00
|
|
|
utils.HASH_PATH_SUFFIX = b'endcap'
|
|
|
|
utils.HASH_PATH_PREFIX = b''
|
2015-11-06 10:49:09 +01:00
|
|
|
# recon cache path
|
|
|
|
self.recon_cache = tempfile.mkdtemp()
|
|
|
|
rmtree(self.recon_cache, ignore_errors=1)
|
|
|
|
os.mkdir(self.recon_cache)
|
2010-07-12 17:03:45 -05:00
|
|
|
# Setup a test ring (stolen from common/test_ring.py)
|
2010-11-16 11:06:39 -08:00
|
|
|
self.testdir = tempfile.mkdtemp()
|
2010-07-12 17:03:45 -05:00
|
|
|
self.devices = os.path.join(self.testdir, 'node')
|
|
|
|
rmtree(self.testdir, ignore_errors=1)
|
|
|
|
os.mkdir(self.testdir)
|
|
|
|
os.mkdir(self.devices)
|
Allow 1+ object-servers-per-disk deployment
Enabled by a new > 0 integer config value, "servers_per_port" in the
[DEFAULT] config section for object-server and/or replication server
configs. The setting's integer value determines how many different
object-server workers handle requests for any single unique local port
in the ring. In this mode, the parent swift-object-server process
continues to run as the original user (i.e. root if low-port binding
is required), binds to all ports as defined in the ring, and forks off
the specified number of workers per listen socket. The child, per-port
servers drop privileges and behave pretty much how object-server workers
always have, except that because the ring has unique ports per disk, the
object-servers will only be handling requests for a single disk. The
parent process detects dead servers and restarts them (with the correct
listen socket), starts missing servers when an updated ring file is
found with a device on the server with a new port, and kills extraneous
servers when their port is found to no longer be in the ring. The ring
files are stat'ed at most every "ring_check_interval" seconds, as
configured in the object-server config (same default of 15s).
Immediately stopping all swift-object-worker processes still works by
sending the parent a SIGTERM. Likewise, a SIGHUP to the parent process
still causes the parent process to close all listen sockets and exit,
allowing existing children to finish serving their existing requests.
The drop_privileges helper function now has an optional param to
suppress the setsid() call, which otherwise screws up the child workers'
process management.
The class method RingData.load() can be told to only load the ring
metadata (i.e. everything except replica2part2dev_id) with the optional
kwarg, header_only=True. This is used to keep the parent and all
forked off workers from unnecessarily having full copies of all storage
policy rings in memory.
A new helper class, swift.common.storage_policy.BindPortsCache,
provides a method to return a set of all device ports in all rings for
the server on which it is instantiated (identified by its set of IP
addresses). The BindPortsCache instance will track mtimes of ring
files, so they are not opened more frequently than necessary.
This patch includes enhancements to the probe tests and
object-replicator/object-reconstructor config plumbing to allow the
probe tests to work correctly both in the "normal" config (same IP but
unique ports for each SAIO "server") and a server-per-port setup where
each SAIO "server" must have a unique IP address and unique port per
disk within each "server". The main probe tests only work with 4
servers and 4 disks, but you can see the difference in the rings for the
EC probe tests where there are 2 disks per server for a total of 8
disks. Specifically, swift.common.ring.utils.is_local_device() will
ignore the ports when the "my_port" argument is None. Then,
object-replicator and object-reconstructor both set self.bind_port to
None if server_per_port is enabled. Bonus improvement for IPv6
addresses in is_local_device().
This PR for vagrant-swift-all-in-one will aid in testing this patch:
https://github.com/swiftstack/vagrant-swift-all-in-one/pull/16/
Also allow SAIO to answer is_local_device() better; common SAIO setups
have multiple "servers" all on the same host with different ports for
the different "servers" (which happen to match the IPs specified in the
rings for the devices on each of those "servers").
However, you can configure the SAIO to have different localhost IP
addresses (e.g. 127.0.0.1, 127.0.0.2, etc.) in the ring and in the
servers' config files' bind_ip setting.
This new whataremyips() implementation combined with a little plumbing
allows is_local_device() to accurately answer, even on an SAIO.
In the default case (an unspecified bind_ip defaults to '0.0.0.0') as
well as an explict "bind to everything" like '0.0.0.0' or '::',
whataremyips() behaves as it always has, returning all IP addresses for
the server.
Also updated probe tests to handle each "server" in the SAIO having a
unique IP address.
For some (noisy) benchmarks that show servers_per_port=X is at least as
good as the same number of "normal" workers:
https://gist.github.com/dbishop/c214f89ca708a6b1624a#file-summary-md
Benchmarks showing the benefits of I/O isolation with a small number of
slow disks:
https://gist.github.com/dbishop/fd0ab067babdecfb07ca#file-results-md
If you were wondering what the overhead of threads_per_disk looks like:
https://gist.github.com/dbishop/1d14755fedc86a161718#file-tabular_results-md
DocImpact
Change-Id: I2239a4000b41a7e7cc53465ce794af49d44796c6
2015-05-14 22:14:15 -07:00
|
|
|
|
|
|
|
self.objects, self.objects_1, self.parts, self.parts_1 = \
|
|
|
|
self._write_disk_data('sda')
|
2014-03-18 10:50:17 -07:00
|
|
|
_create_test_rings(self.testdir)
|
Allow 1+ object-servers-per-disk deployment
Enabled by a new > 0 integer config value, "servers_per_port" in the
[DEFAULT] config section for object-server and/or replication server
configs. The setting's integer value determines how many different
object-server workers handle requests for any single unique local port
in the ring. In this mode, the parent swift-object-server process
continues to run as the original user (i.e. root if low-port binding
is required), binds to all ports as defined in the ring, and forks off
the specified number of workers per listen socket. The child, per-port
servers drop privileges and behave pretty much how object-server workers
always have, except that because the ring has unique ports per disk, the
object-servers will only be handling requests for a single disk. The
parent process detects dead servers and restarts them (with the correct
listen socket), starts missing servers when an updated ring file is
found with a device on the server with a new port, and kills extraneous
servers when their port is found to no longer be in the ring. The ring
files are stat'ed at most every "ring_check_interval" seconds, as
configured in the object-server config (same default of 15s).
Immediately stopping all swift-object-worker processes still works by
sending the parent a SIGTERM. Likewise, a SIGHUP to the parent process
still causes the parent process to close all listen sockets and exit,
allowing existing children to finish serving their existing requests.
The drop_privileges helper function now has an optional param to
suppress the setsid() call, which otherwise screws up the child workers'
process management.
The class method RingData.load() can be told to only load the ring
metadata (i.e. everything except replica2part2dev_id) with the optional
kwarg, header_only=True. This is used to keep the parent and all
forked off workers from unnecessarily having full copies of all storage
policy rings in memory.
A new helper class, swift.common.storage_policy.BindPortsCache,
provides a method to return a set of all device ports in all rings for
the server on which it is instantiated (identified by its set of IP
addresses). The BindPortsCache instance will track mtimes of ring
files, so they are not opened more frequently than necessary.
This patch includes enhancements to the probe tests and
object-replicator/object-reconstructor config plumbing to allow the
probe tests to work correctly both in the "normal" config (same IP but
unique ports for each SAIO "server") and a server-per-port setup where
each SAIO "server" must have a unique IP address and unique port per
disk within each "server". The main probe tests only work with 4
servers and 4 disks, but you can see the difference in the rings for the
EC probe tests where there are 2 disks per server for a total of 8
disks. Specifically, swift.common.ring.utils.is_local_device() will
ignore the ports when the "my_port" argument is None. Then,
object-replicator and object-reconstructor both set self.bind_port to
None if server_per_port is enabled. Bonus improvement for IPv6
addresses in is_local_device().
This PR for vagrant-swift-all-in-one will aid in testing this patch:
https://github.com/swiftstack/vagrant-swift-all-in-one/pull/16/
Also allow SAIO to answer is_local_device() better; common SAIO setups
have multiple "servers" all on the same host with different ports for
the different "servers" (which happen to match the IPs specified in the
rings for the devices on each of those "servers").
However, you can configure the SAIO to have different localhost IP
addresses (e.g. 127.0.0.1, 127.0.0.2, etc.) in the ring and in the
servers' config files' bind_ip setting.
This new whataremyips() implementation combined with a little plumbing
allows is_local_device() to accurately answer, even on an SAIO.
In the default case (an unspecified bind_ip defaults to '0.0.0.0') as
well as an explict "bind to everything" like '0.0.0.0' or '::',
whataremyips() behaves as it always has, returning all IP addresses for
the server.
Also updated probe tests to handle each "server" in the SAIO having a
unique IP address.
For some (noisy) benchmarks that show servers_per_port=X is at least as
good as the same number of "normal" workers:
https://gist.github.com/dbishop/c214f89ca708a6b1624a#file-summary-md
Benchmarks showing the benefits of I/O isolation with a small number of
slow disks:
https://gist.github.com/dbishop/fd0ab067babdecfb07ca#file-results-md
If you were wondering what the overhead of threads_per_disk looks like:
https://gist.github.com/dbishop/1d14755fedc86a161718#file-tabular_results-md
DocImpact
Change-Id: I2239a4000b41a7e7cc53465ce794af49d44796c6
2015-05-14 22:14:15 -07:00
|
|
|
self.logger = debug_logger('test-replicator')
|
2010-07-12 17:03:45 -05:00
|
|
|
self.conf = dict(
|
2016-02-01 18:06:54 +00:00
|
|
|
bind_ip=_ips()[0], bind_port=6200,
|
2010-07-12 17:03:45 -05:00
|
|
|
swift_dir=self.testdir, devices=self.devices, mount_check='false',
|
Multiprocess object replicator
Add a multiprocess mode to the object replicator. Setting the
"replicator_workers" setting to a positive value N will result in the
replicator using up to N worker processes to perform replication
tasks.
At most one worker per disk will be spawned, so one can set
replicator_workers=99999999 to always get one worker per disk
regardless of the number of disks in each node. This is the same
behavior that the object reconstructor has.
Worker process logs will have a bit of information prepended so
operators can tell which messages came from which worker. It looks
like this:
[worker 1/2 pid=16529] 154/154 (100.00%) partitions replicated in 1.02s (150.87/sec, 0s remaining)
The prefix is "[worker M/N pid=P] ", where M is the worker's index, N
is the total number of workers, and P is the process ID. Every message
from the replicator's logger will have the prefix; this includes
messages from down in diskfile, but does not include things printed to
stdout or stderr.
Drive-by fix: don't dump recon stats when replicating only certain
policies. When running the object replicator with replicator_workers >
0 and "--policies=X,Y,Z", the replicator would update recon stats
after running. Since it only ran on a subset of objects, it should not
update recon, much like it doesn't update recon when run with
--devices or --partitions.
Change-Id: I6802a9ad9f1f9b9dafb99d8b095af0fdbf174dc5
2018-03-22 17:08:48 -07:00
|
|
|
timeout='300', stats_interval='1', sync_method='rsync',
|
|
|
|
recon_cache_path=self.recon_cache)
|
Allow 1+ object-servers-per-disk deployment
Enabled by a new > 0 integer config value, "servers_per_port" in the
[DEFAULT] config section for object-server and/or replication server
configs. The setting's integer value determines how many different
object-server workers handle requests for any single unique local port
in the ring. In this mode, the parent swift-object-server process
continues to run as the original user (i.e. root if low-port binding
is required), binds to all ports as defined in the ring, and forks off
the specified number of workers per listen socket. The child, per-port
servers drop privileges and behave pretty much how object-server workers
always have, except that because the ring has unique ports per disk, the
object-servers will only be handling requests for a single disk. The
parent process detects dead servers and restarts them (with the correct
listen socket), starts missing servers when an updated ring file is
found with a device on the server with a new port, and kills extraneous
servers when their port is found to no longer be in the ring. The ring
files are stat'ed at most every "ring_check_interval" seconds, as
configured in the object-server config (same default of 15s).
Immediately stopping all swift-object-worker processes still works by
sending the parent a SIGTERM. Likewise, a SIGHUP to the parent process
still causes the parent process to close all listen sockets and exit,
allowing existing children to finish serving their existing requests.
The drop_privileges helper function now has an optional param to
suppress the setsid() call, which otherwise screws up the child workers'
process management.
The class method RingData.load() can be told to only load the ring
metadata (i.e. everything except replica2part2dev_id) with the optional
kwarg, header_only=True. This is used to keep the parent and all
forked off workers from unnecessarily having full copies of all storage
policy rings in memory.
A new helper class, swift.common.storage_policy.BindPortsCache,
provides a method to return a set of all device ports in all rings for
the server on which it is instantiated (identified by its set of IP
addresses). The BindPortsCache instance will track mtimes of ring
files, so they are not opened more frequently than necessary.
This patch includes enhancements to the probe tests and
object-replicator/object-reconstructor config plumbing to allow the
probe tests to work correctly both in the "normal" config (same IP but
unique ports for each SAIO "server") and a server-per-port setup where
each SAIO "server" must have a unique IP address and unique port per
disk within each "server". The main probe tests only work with 4
servers and 4 disks, but you can see the difference in the rings for the
EC probe tests where there are 2 disks per server for a total of 8
disks. Specifically, swift.common.ring.utils.is_local_device() will
ignore the ports when the "my_port" argument is None. Then,
object-replicator and object-reconstructor both set self.bind_port to
None if server_per_port is enabled. Bonus improvement for IPv6
addresses in is_local_device().
This PR for vagrant-swift-all-in-one will aid in testing this patch:
https://github.com/swiftstack/vagrant-swift-all-in-one/pull/16/
Also allow SAIO to answer is_local_device() better; common SAIO setups
have multiple "servers" all on the same host with different ports for
the different "servers" (which happen to match the IPs specified in the
rings for the devices on each of those "servers").
However, you can configure the SAIO to have different localhost IP
addresses (e.g. 127.0.0.1, 127.0.0.2, etc.) in the ring and in the
servers' config files' bind_ip setting.
This new whataremyips() implementation combined with a little plumbing
allows is_local_device() to accurately answer, even on an SAIO.
In the default case (an unspecified bind_ip defaults to '0.0.0.0') as
well as an explict "bind to everything" like '0.0.0.0' or '::',
whataremyips() behaves as it always has, returning all IP addresses for
the server.
Also updated probe tests to handle each "server" in the SAIO having a
unique IP address.
For some (noisy) benchmarks that show servers_per_port=X is at least as
good as the same number of "normal" workers:
https://gist.github.com/dbishop/c214f89ca708a6b1624a#file-summary-md
Benchmarks showing the benefits of I/O isolation with a small number of
slow disks:
https://gist.github.com/dbishop/fd0ab067babdecfb07ca#file-results-md
If you were wondering what the overhead of threads_per_disk looks like:
https://gist.github.com/dbishop/1d14755fedc86a161718#file-tabular_results-md
DocImpact
Change-Id: I2239a4000b41a7e7cc53465ce794af49d44796c6
2015-05-14 22:14:15 -07:00
|
|
|
self._create_replicator()
|
2015-08-21 18:14:55 -07:00
|
|
|
self.ts = make_timestamp_iter()
|
2010-07-12 17:03:45 -05:00
|
|
|
|
2010-10-29 15:26:35 -07:00
|
|
|
def tearDown(self):
|
2015-08-21 18:14:55 -07:00
|
|
|
self.assertFalse(process_errors)
|
2010-10-29 15:26:35 -07:00
|
|
|
rmtree(self.testdir, ignore_errors=1)
|
2015-11-06 10:49:09 +01:00
|
|
|
rmtree(self.recon_cache, ignore_errors=1)
|
2010-10-29 15:26:35 -07:00
|
|
|
|
2015-07-14 10:03:18 +05:30
|
|
|
def test_handoff_replication_setting_warnings(self):
|
2015-08-21 14:04:46 -07:00
|
|
|
conf_tests = [
|
|
|
|
# (config, expected_warning)
|
|
|
|
({}, False),
|
|
|
|
({'handoff_delete': 'auto'}, False),
|
|
|
|
({'handoffs_first': 'no'}, False),
|
|
|
|
({'handoff_delete': '2'}, True),
|
|
|
|
({'handoffs_first': 'yes'}, True),
|
|
|
|
({'handoff_delete': '1', 'handoffs_first': 'yes'}, True),
|
|
|
|
]
|
|
|
|
log_message = 'Handoff only mode is not intended for normal ' \
|
|
|
|
'operation, please disable handoffs_first and ' \
|
|
|
|
'handoff_delete before the next normal rebalance'
|
|
|
|
for config, expected_warning in conf_tests:
|
|
|
|
self.logger.clear()
|
|
|
|
object_replicator.ObjectReplicator(config, logger=self.logger)
|
|
|
|
warning_log_lines = self.logger.get_lines_for_level('warning')
|
|
|
|
if expected_warning:
|
|
|
|
expected_log_lines = [log_message]
|
|
|
|
else:
|
|
|
|
expected_log_lines = []
|
|
|
|
self.assertEqual(expected_log_lines, warning_log_lines,
|
|
|
|
'expected %s != %s for config %r' % (
|
|
|
|
expected_log_lines,
|
|
|
|
warning_log_lines,
|
|
|
|
config,
|
|
|
|
))
|
2015-07-14 10:03:18 +05:30
|
|
|
|
2016-08-11 00:53:13 +08:00
|
|
|
def _write_disk_data(self, disk_name, with_json=False):
|
Allow 1+ object-servers-per-disk deployment
Enabled by a new > 0 integer config value, "servers_per_port" in the
[DEFAULT] config section for object-server and/or replication server
configs. The setting's integer value determines how many different
object-server workers handle requests for any single unique local port
in the ring. In this mode, the parent swift-object-server process
continues to run as the original user (i.e. root if low-port binding
is required), binds to all ports as defined in the ring, and forks off
the specified number of workers per listen socket. The child, per-port
servers drop privileges and behave pretty much how object-server workers
always have, except that because the ring has unique ports per disk, the
object-servers will only be handling requests for a single disk. The
parent process detects dead servers and restarts them (with the correct
listen socket), starts missing servers when an updated ring file is
found with a device on the server with a new port, and kills extraneous
servers when their port is found to no longer be in the ring. The ring
files are stat'ed at most every "ring_check_interval" seconds, as
configured in the object-server config (same default of 15s).
Immediately stopping all swift-object-worker processes still works by
sending the parent a SIGTERM. Likewise, a SIGHUP to the parent process
still causes the parent process to close all listen sockets and exit,
allowing existing children to finish serving their existing requests.
The drop_privileges helper function now has an optional param to
suppress the setsid() call, which otherwise screws up the child workers'
process management.
The class method RingData.load() can be told to only load the ring
metadata (i.e. everything except replica2part2dev_id) with the optional
kwarg, header_only=True. This is used to keep the parent and all
forked off workers from unnecessarily having full copies of all storage
policy rings in memory.
A new helper class, swift.common.storage_policy.BindPortsCache,
provides a method to return a set of all device ports in all rings for
the server on which it is instantiated (identified by its set of IP
addresses). The BindPortsCache instance will track mtimes of ring
files, so they are not opened more frequently than necessary.
This patch includes enhancements to the probe tests and
object-replicator/object-reconstructor config plumbing to allow the
probe tests to work correctly both in the "normal" config (same IP but
unique ports for each SAIO "server") and a server-per-port setup where
each SAIO "server" must have a unique IP address and unique port per
disk within each "server". The main probe tests only work with 4
servers and 4 disks, but you can see the difference in the rings for the
EC probe tests where there are 2 disks per server for a total of 8
disks. Specifically, swift.common.ring.utils.is_local_device() will
ignore the ports when the "my_port" argument is None. Then,
object-replicator and object-reconstructor both set self.bind_port to
None if server_per_port is enabled. Bonus improvement for IPv6
addresses in is_local_device().
This PR for vagrant-swift-all-in-one will aid in testing this patch:
https://github.com/swiftstack/vagrant-swift-all-in-one/pull/16/
Also allow SAIO to answer is_local_device() better; common SAIO setups
have multiple "servers" all on the same host with different ports for
the different "servers" (which happen to match the IPs specified in the
rings for the devices on each of those "servers").
However, you can configure the SAIO to have different localhost IP
addresses (e.g. 127.0.0.1, 127.0.0.2, etc.) in the ring and in the
servers' config files' bind_ip setting.
This new whataremyips() implementation combined with a little plumbing
allows is_local_device() to accurately answer, even on an SAIO.
In the default case (an unspecified bind_ip defaults to '0.0.0.0') as
well as an explict "bind to everything" like '0.0.0.0' or '::',
whataremyips() behaves as it always has, returning all IP addresses for
the server.
Also updated probe tests to handle each "server" in the SAIO having a
unique IP address.
For some (noisy) benchmarks that show servers_per_port=X is at least as
good as the same number of "normal" workers:
https://gist.github.com/dbishop/c214f89ca708a6b1624a#file-summary-md
Benchmarks showing the benefits of I/O isolation with a small number of
slow disks:
https://gist.github.com/dbishop/fd0ab067babdecfb07ca#file-results-md
If you were wondering what the overhead of threads_per_disk looks like:
https://gist.github.com/dbishop/1d14755fedc86a161718#file-tabular_results-md
DocImpact
Change-Id: I2239a4000b41a7e7cc53465ce794af49d44796c6
2015-05-14 22:14:15 -07:00
|
|
|
os.mkdir(os.path.join(self.devices, disk_name))
|
|
|
|
objects = os.path.join(self.devices, disk_name,
|
|
|
|
diskfile.get_data_dir(POLICIES[0]))
|
|
|
|
objects_1 = os.path.join(self.devices, disk_name,
|
|
|
|
diskfile.get_data_dir(POLICIES[1]))
|
|
|
|
os.mkdir(objects)
|
|
|
|
os.mkdir(objects_1)
|
|
|
|
parts = {}
|
|
|
|
parts_1 = {}
|
|
|
|
for part in ['0', '1', '2', '3']:
|
|
|
|
parts[part] = os.path.join(objects, part)
|
|
|
|
os.mkdir(parts[part])
|
|
|
|
parts_1[part] = os.path.join(objects_1, part)
|
|
|
|
os.mkdir(parts_1[part])
|
|
|
|
|
2016-08-11 00:53:13 +08:00
|
|
|
if with_json:
|
|
|
|
for json_file in ['auditor_status_ZBF.json',
|
|
|
|
'auditor_status_ALL.json']:
|
|
|
|
for obj_dir in [objects, objects_1]:
|
|
|
|
with open(os.path.join(obj_dir, json_file), 'w'):
|
|
|
|
pass
|
|
|
|
|
Allow 1+ object-servers-per-disk deployment
Enabled by a new > 0 integer config value, "servers_per_port" in the
[DEFAULT] config section for object-server and/or replication server
configs. The setting's integer value determines how many different
object-server workers handle requests for any single unique local port
in the ring. In this mode, the parent swift-object-server process
continues to run as the original user (i.e. root if low-port binding
is required), binds to all ports as defined in the ring, and forks off
the specified number of workers per listen socket. The child, per-port
servers drop privileges and behave pretty much how object-server workers
always have, except that because the ring has unique ports per disk, the
object-servers will only be handling requests for a single disk. The
parent process detects dead servers and restarts them (with the correct
listen socket), starts missing servers when an updated ring file is
found with a device on the server with a new port, and kills extraneous
servers when their port is found to no longer be in the ring. The ring
files are stat'ed at most every "ring_check_interval" seconds, as
configured in the object-server config (same default of 15s).
Immediately stopping all swift-object-worker processes still works by
sending the parent a SIGTERM. Likewise, a SIGHUP to the parent process
still causes the parent process to close all listen sockets and exit,
allowing existing children to finish serving their existing requests.
The drop_privileges helper function now has an optional param to
suppress the setsid() call, which otherwise screws up the child workers'
process management.
The class method RingData.load() can be told to only load the ring
metadata (i.e. everything except replica2part2dev_id) with the optional
kwarg, header_only=True. This is used to keep the parent and all
forked off workers from unnecessarily having full copies of all storage
policy rings in memory.
A new helper class, swift.common.storage_policy.BindPortsCache,
provides a method to return a set of all device ports in all rings for
the server on which it is instantiated (identified by its set of IP
addresses). The BindPortsCache instance will track mtimes of ring
files, so they are not opened more frequently than necessary.
This patch includes enhancements to the probe tests and
object-replicator/object-reconstructor config plumbing to allow the
probe tests to work correctly both in the "normal" config (same IP but
unique ports for each SAIO "server") and a server-per-port setup where
each SAIO "server" must have a unique IP address and unique port per
disk within each "server". The main probe tests only work with 4
servers and 4 disks, but you can see the difference in the rings for the
EC probe tests where there are 2 disks per server for a total of 8
disks. Specifically, swift.common.ring.utils.is_local_device() will
ignore the ports when the "my_port" argument is None. Then,
object-replicator and object-reconstructor both set self.bind_port to
None if server_per_port is enabled. Bonus improvement for IPv6
addresses in is_local_device().
This PR for vagrant-swift-all-in-one will aid in testing this patch:
https://github.com/swiftstack/vagrant-swift-all-in-one/pull/16/
Also allow SAIO to answer is_local_device() better; common SAIO setups
have multiple "servers" all on the same host with different ports for
the different "servers" (which happen to match the IPs specified in the
rings for the devices on each of those "servers").
However, you can configure the SAIO to have different localhost IP
addresses (e.g. 127.0.0.1, 127.0.0.2, etc.) in the ring and in the
servers' config files' bind_ip setting.
This new whataremyips() implementation combined with a little plumbing
allows is_local_device() to accurately answer, even on an SAIO.
In the default case (an unspecified bind_ip defaults to '0.0.0.0') as
well as an explict "bind to everything" like '0.0.0.0' or '::',
whataremyips() behaves as it always has, returning all IP addresses for
the server.
Also updated probe tests to handle each "server" in the SAIO having a
unique IP address.
For some (noisy) benchmarks that show servers_per_port=X is at least as
good as the same number of "normal" workers:
https://gist.github.com/dbishop/c214f89ca708a6b1624a#file-summary-md
Benchmarks showing the benefits of I/O isolation with a small number of
slow disks:
https://gist.github.com/dbishop/fd0ab067babdecfb07ca#file-results-md
If you were wondering what the overhead of threads_per_disk looks like:
https://gist.github.com/dbishop/1d14755fedc86a161718#file-tabular_results-md
DocImpact
Change-Id: I2239a4000b41a7e7cc53465ce794af49d44796c6
2015-05-14 22:14:15 -07:00
|
|
|
return objects, objects_1, parts, parts_1
|
|
|
|
|
|
|
|
def _create_replicator(self):
|
|
|
|
self.replicator = object_replicator.ObjectReplicator(self.conf)
|
|
|
|
self.replicator.logger = self.logger
|
Enable Object Replicator's failure count in recon
This patch makes the count of object replication failure in recon.
And "failure_nodes" is added to Account Replicator and
Container Replicator.
Recon shows the count of object repliction failure as follows:
$ curl http://<ip>:<port>/recon/replication/object
{
"replication_last": 1416334368.60865,
"replication_stats": {
"attempted": 13346,
"failure": 870,
"failure_nodes": {
"192.168.0.1": {"sdb1": 3},
"192.168.0.2": {"sdb1": 851,
"sdc1": 1,
"sdd1": 8},
"192.168.0.3": {"sdb1": 3,
"sdc1": 4}
},
"hashmatch": 0,
"remove": 0,
"rsync": 0,
"start": 1416354240.9761429,
"success": 1908
},
"replication_time": 2316.5563162644703,
"object_replication_last": 1416334368.60865,
"object_replication_time": 2316.5563162644703
}
Note that 'object_replication_last' and 'object_replication_time' are
considered to be transitional and will be removed in the subsequent
releases. Use 'replication_last' and 'replication_time' instead.
Additionaly this patch adds the count in swift-recon and it will be
showed as follows:
$ swift-recon object -r
========================================================================
=======
--> Starting reconnaissance on 4 hosts
========================================================================
=======
[2014-11-27 16:14:09] Checking on replication
[replication_failure] low: 0, high: 0, avg: 0.0, total: 0, Failed: 0.0%,
no_result: 0, reported: 4
[replication_success] low: 3, high: 3, avg: 3.0, total: 12,
Failed: 0.0%, no_result: 0, reported: 4
[replication_time] low: 0, high: 0, avg: 0.0, total: 0, Failed: 0.0%,
no_result: 0, reported: 4
[replication_attempted] low: 1, high: 1, avg: 1.0, total: 4,
Failed: 0.0%, no_result: 0, reported: 4
Oldest completion was 2014-11-27 16:09:45 (4 minutes ago) by
192.168.0.4:6002.
Most recent completion was 2014-11-27 16:14:19 (-10 seconds ago) by
192.168.0.1:6002.
========================================================================
=======
In case there is a cluster which has servers, a server runs with this
patch and the other servers run without this patch. If swift-recon
executes on the server which runs with this patch, there are unnecessary
information on the output such as [failure], [success] and [attempted].
Because other servers which run without this patch are not able to
send a response with information that this patch needs.
Therefore once you apply this patch, you also apply this patch to other
servers before you execute swift-recon.
DocImpact
Change-Id: Iecd33655ae2568482833131f422679996c374d78
Co-Authored-By: Kenichiro Matsuda <matsuda_kenichi@jp.fujitsu.com>
Co-Authored-By: Brian Cline <bcline@softlayer.com>
Implements: blueprint enable-object-replication-failure-in-recon
2014-12-03 06:15:16 +09:00
|
|
|
self.replicator._zero_stats()
|
|
|
|
self.replicator.all_devs_info = set()
|
Allow 1+ object-servers-per-disk deployment
Enabled by a new > 0 integer config value, "servers_per_port" in the
[DEFAULT] config section for object-server and/or replication server
configs. The setting's integer value determines how many different
object-server workers handle requests for any single unique local port
in the ring. In this mode, the parent swift-object-server process
continues to run as the original user (i.e. root if low-port binding
is required), binds to all ports as defined in the ring, and forks off
the specified number of workers per listen socket. The child, per-port
servers drop privileges and behave pretty much how object-server workers
always have, except that because the ring has unique ports per disk, the
object-servers will only be handling requests for a single disk. The
parent process detects dead servers and restarts them (with the correct
listen socket), starts missing servers when an updated ring file is
found with a device on the server with a new port, and kills extraneous
servers when their port is found to no longer be in the ring. The ring
files are stat'ed at most every "ring_check_interval" seconds, as
configured in the object-server config (same default of 15s).
Immediately stopping all swift-object-worker processes still works by
sending the parent a SIGTERM. Likewise, a SIGHUP to the parent process
still causes the parent process to close all listen sockets and exit,
allowing existing children to finish serving their existing requests.
The drop_privileges helper function now has an optional param to
suppress the setsid() call, which otherwise screws up the child workers'
process management.
The class method RingData.load() can be told to only load the ring
metadata (i.e. everything except replica2part2dev_id) with the optional
kwarg, header_only=True. This is used to keep the parent and all
forked off workers from unnecessarily having full copies of all storage
policy rings in memory.
A new helper class, swift.common.storage_policy.BindPortsCache,
provides a method to return a set of all device ports in all rings for
the server on which it is instantiated (identified by its set of IP
addresses). The BindPortsCache instance will track mtimes of ring
files, so they are not opened more frequently than necessary.
This patch includes enhancements to the probe tests and
object-replicator/object-reconstructor config plumbing to allow the
probe tests to work correctly both in the "normal" config (same IP but
unique ports for each SAIO "server") and a server-per-port setup where
each SAIO "server" must have a unique IP address and unique port per
disk within each "server". The main probe tests only work with 4
servers and 4 disks, but you can see the difference in the rings for the
EC probe tests where there are 2 disks per server for a total of 8
disks. Specifically, swift.common.ring.utils.is_local_device() will
ignore the ports when the "my_port" argument is None. Then,
object-replicator and object-reconstructor both set self.bind_port to
None if server_per_port is enabled. Bonus improvement for IPv6
addresses in is_local_device().
This PR for vagrant-swift-all-in-one will aid in testing this patch:
https://github.com/swiftstack/vagrant-swift-all-in-one/pull/16/
Also allow SAIO to answer is_local_device() better; common SAIO setups
have multiple "servers" all on the same host with different ports for
the different "servers" (which happen to match the IPs specified in the
rings for the devices on each of those "servers").
However, you can configure the SAIO to have different localhost IP
addresses (e.g. 127.0.0.1, 127.0.0.2, etc.) in the ring and in the
servers' config files' bind_ip setting.
This new whataremyips() implementation combined with a little plumbing
allows is_local_device() to accurately answer, even on an SAIO.
In the default case (an unspecified bind_ip defaults to '0.0.0.0') as
well as an explict "bind to everything" like '0.0.0.0' or '::',
whataremyips() behaves as it always has, returning all IP addresses for
the server.
Also updated probe tests to handle each "server" in the SAIO having a
unique IP address.
For some (noisy) benchmarks that show servers_per_port=X is at least as
good as the same number of "normal" workers:
https://gist.github.com/dbishop/c214f89ca708a6b1624a#file-summary-md
Benchmarks showing the benefits of I/O isolation with a small number of
slow disks:
https://gist.github.com/dbishop/fd0ab067babdecfb07ca#file-results-md
If you were wondering what the overhead of threads_per_disk looks like:
https://gist.github.com/dbishop/1d14755fedc86a161718#file-tabular_results-md
DocImpact
Change-Id: I2239a4000b41a7e7cc53465ce794af49d44796c6
2015-05-14 22:14:15 -07:00
|
|
|
self.df_mgr = diskfile.DiskFileManager(self.conf, self.logger)
|
|
|
|
|
2015-11-06 10:49:09 +01:00
|
|
|
def test_run_once_no_local_device_in_ring(self):
|
|
|
|
conf = dict(swift_dir=self.testdir, devices=self.devices,
|
|
|
|
bind_ip='1.1.1.1', recon_cache_path=self.recon_cache,
|
|
|
|
mount_check='false', timeout='300', stats_interval='1')
|
|
|
|
replicator = object_replicator.ObjectReplicator(conf,
|
|
|
|
logger=self.logger)
|
|
|
|
replicator.run_once()
|
|
|
|
expected = [
|
2016-05-03 14:33:05 +02:00
|
|
|
"Can't find itself in policy with index 0 with ips 1.1.1.1 and"
|
|
|
|
" with port 6200 in ring file, not replicating",
|
|
|
|
"Can't find itself in policy with index 1 with ips 1.1.1.1 and"
|
|
|
|
" with port 6200 in ring file, not replicating",
|
2015-11-06 10:49:09 +01:00
|
|
|
]
|
|
|
|
self.assertEqual(expected, self.logger.get_lines_for_level('error'))
|
|
|
|
|
2010-10-29 15:26:35 -07:00
|
|
|
    def test_run_once(self):
        # End-to-end exercise of run_once() for policy 0: create a single
        # .data file on the local device, stub out rsync (via
        # _mock_process) and the object-server HTTP path (via
        # mock_http_connect), then verify the replication cycle counter,
        # the recon cache dump, and the emitted log lines.
        conf = dict(swift_dir=self.testdir, devices=self.devices,
                    bind_ip=_ips()[0], recon_cache_path=self.recon_cache,
                    mount_check='false', timeout='300', stats_interval='1')
        replicator = object_replicator.ObjectReplicator(conf,
                                                        logger=self.logger)
        # Monkey-patch the module-level http_connect; restored at the end.
        was_connector = object_replicator.http_connect
        object_replicator.http_connect = mock_http_connect(200)
        cur_part = '0'
        df = self.df_mgr.get_diskfile('sda', cur_part, 'a', 'c', 'o',
                                      policy=POLICIES[0])
        mkdirs(df._datadir)
        f = open(os.path.join(df._datadir,
                              normalize_timestamp(time.time()) + '.data'),
                 'wb')
        f.write(b'1234567890')
        f.close()
        ohash = hash_path('a', 'c', 'o')
        data_dir = ohash[-3:]
        whole_path_from = os.path.join(self.objects, cur_part, data_dir)
        # Build the expected rsync invocations: one per remote node
        # (i.e. every node in the partition that is not a local IP).
        process_arg_checker = []
        ring = replicator.load_object_ring(POLICIES[0])
        nodes = [node for node in
                 ring.get_part_nodes(int(cur_part))
                 if node['ip'] not in _ips()]
        rsync_mods = tuple(['%s::object/sda/objects/%s' %
                            (node['ip'], cur_part) for node in nodes])
        for node in nodes:
            process_arg_checker.append(
                (0, '', ['rsync', whole_path_from, rsync_mods]))
        start = replicator.replication_cycle
        self.assertGreaterEqual(start, 0)
        self.assertLessEqual(start, 9)
        with _mock_process(process_arg_checker):
            replicator.run_once()
        # The cycle counter advances by one per run, modulo 10.
        self.assertEqual((start + 1) % 10, replicator.replication_cycle)
        self.assertFalse(process_errors)
        self.assertFalse(self.logger.get_lines_for_level('error'))

        # Returns 0 at first, and 60 on all following .next() calls
        def _infinite_gen():
            yield 0
            while True:
                yield 60

        # Run nine more cycles with time.time() mocked so each run appears
        # to take exactly 60 seconds (hence "1.00 minutes" below).
        for cycle in range(1, 10):
            with _mock_process(process_arg_checker):
                with mock.patch('time.time', side_effect=_infinite_gen()):
                    replicator.run_once()
                self.assertEqual((start + 1 + cycle) % 10,
                                 replicator.replication_cycle)

        # run_once() must have dumped replication stats to the recon cache.
        recon_fname = os.path.join(self.recon_cache, "object.recon")
        with open(recon_fname) as cachefile:
            recon = json.loads(cachefile.read())
        self.assertEqual(1, recon.get('replication_time'))
        self.assertIn('replication_stats', recon)
        self.assertIn('replication_last', recon)
        expected = 'Object replication complete (once). (1.00 minutes)'
        self.assertIn(expected, self.logger.get_lines_for_level('info'))
        self.assertFalse(self.logger.get_lines_for_level('error'))
        object_replicator.http_connect = was_connector
|
2014-03-18 10:50:17 -07:00
|
|
|
    # policy 1
    def test_run_once_1(self):
        # Same end-to-end run_once() exercise as test_run_once, but for
        # storage policy 1: the data lives under objects-1/ and the rsync
        # module path must say "objects-1".  No bind_ip is configured, so
        # whataremyips is patched to _ips for local-device detection.
        conf = dict(swift_dir=self.testdir, devices=self.devices,
                    recon_cache_path=self.recon_cache,
                    mount_check='false', timeout='300', stats_interval='1')
        replicator = object_replicator.ObjectReplicator(conf,
                                                        logger=self.logger)
        # Monkey-patch the module-level http_connect; restored at the end.
        was_connector = object_replicator.http_connect
        object_replicator.http_connect = mock_http_connect(200)
        cur_part = '0'
        df = self.df_mgr.get_diskfile('sda', cur_part, 'a', 'c', 'o',
                                      policy=POLICIES[1])
        mkdirs(df._datadir)
        f = open(os.path.join(df._datadir,
                              normalize_timestamp(time.time()) + '.data'),
                 'wb')
        f.write(b'1234567890')
        f.close()
        ohash = hash_path('a', 'c', 'o')
        data_dir = ohash[-3:]
        whole_path_from = os.path.join(self.objects_1, cur_part, data_dir)
        # Expected rsync calls: one per non-local node in the partition,
        # all targeting the policy-1 "objects-1" rsync module.
        process_arg_checker = []
        ring = replicator.load_object_ring(POLICIES[1])
        nodes = [node for node in
                 ring.get_part_nodes(int(cur_part))
                 if node['ip'] not in _ips()]
        rsync_mods = tuple(['%s::object/sda/objects-1/%s' %
                            (node['ip'], cur_part) for node in nodes])
        for node in nodes:
            process_arg_checker.append(
                (0, '', ['rsync', whole_path_from, rsync_mods]))
        with _mock_process(process_arg_checker):
            with mock.patch('swift.obj.replicator.whataremyips',
                            side_effect=_ips):
                replicator.run_once()
        self.assertFalse(process_errors)
        self.assertFalse(self.logger.get_lines_for_level('error'))
        object_replicator.http_connect = was_connector
|
2010-11-05 09:15:31 -07:00
|
|
|
def test_check_ring(self):
|
2014-03-18 10:50:17 -07:00
|
|
|
for pol in POLICIES:
|
2015-03-17 08:32:57 +00:00
|
|
|
obj_ring = self.replicator.load_object_ring(pol)
|
2014-03-18 10:50:17 -07:00
|
|
|
self.assertTrue(self.replicator.check_ring(obj_ring))
|
|
|
|
orig_check = self.replicator.next_check
|
|
|
|
self.replicator.next_check = orig_check - 30
|
|
|
|
self.assertTrue(self.replicator.check_ring(obj_ring))
|
|
|
|
self.replicator.next_check = orig_check
|
|
|
|
orig_ring_time = obj_ring._mtime
|
|
|
|
obj_ring._mtime = orig_ring_time - 30
|
|
|
|
self.assertTrue(self.replicator.check_ring(obj_ring))
|
|
|
|
self.replicator.next_check = orig_check - 30
|
|
|
|
self.assertFalse(self.replicator.check_ring(obj_ring))
|
2010-11-05 09:15:31 -07:00
|
|
|
|
2013-01-12 07:25:15 +00:00
|
|
|
def test_collect_jobs_mkdirs_error(self):
|
|
|
|
|
2015-03-31 22:35:37 -07:00
|
|
|
non_local = {}
|
|
|
|
|
2013-01-12 07:25:15 +00:00
|
|
|
def blowup_mkdirs(path):
|
2015-03-31 22:35:37 -07:00
|
|
|
non_local['path'] = path
|
2013-01-12 07:25:15 +00:00
|
|
|
raise OSError('Ow!')
|
|
|
|
|
2015-01-22 15:26:19 -08:00
|
|
|
with mock.patch.object(object_replicator, 'mkdirs', blowup_mkdirs):
|
2013-01-12 07:25:15 +00:00
|
|
|
rmtree(self.objects, ignore_errors=1)
|
|
|
|
object_replicator.mkdirs = blowup_mkdirs
|
2013-03-26 20:42:26 +00:00
|
|
|
self.replicator.collect_jobs()
|
2015-03-31 22:35:37 -07:00
|
|
|
self.assertEqual(self.logger.get_lines_for_level('error'), [
|
|
|
|
'ERROR creating %s: ' % non_local['path']])
|
|
|
|
log_args, log_kwargs = self.logger.log_dict['error'][0]
|
|
|
|
self.assertEqual(str(log_kwargs['exc_info'][1]), 'Ow!')
|
2013-01-12 07:25:15 +00:00
|
|
|
|
2010-11-05 09:15:31 -07:00
|
|
|
def test_collect_jobs(self):
|
|
|
|
jobs = self.replicator.collect_jobs()
|
|
|
|
jobs_to_delete = [j for j in jobs if j['delete']]
|
2014-03-18 10:50:17 -07:00
|
|
|
jobs_by_pol_part = {}
|
2010-11-05 09:15:31 -07:00
|
|
|
for job in jobs:
|
2015-03-17 08:32:57 +00:00
|
|
|
jobs_by_pol_part[str(int(job['policy'])) + job['partition']] = job
|
2015-08-06 00:55:36 +05:30
|
|
|
self.assertEqual(len(jobs_to_delete), 2)
|
2019-07-03 16:41:38 +08:00
|
|
|
self.assertEqual('1', jobs_to_delete[0]['partition'])
|
2015-08-06 00:55:36 +05:30
|
|
|
self.assertEqual(
|
2014-03-18 10:50:17 -07:00
|
|
|
[node['id'] for node in jobs_by_pol_part['00']['nodes']], [1, 2])
|
2015-08-06 00:55:36 +05:30
|
|
|
self.assertEqual(
|
2014-03-18 10:50:17 -07:00
|
|
|
[node['id'] for node in jobs_by_pol_part['01']['nodes']],
|
|
|
|
[1, 2, 3])
|
2015-08-06 00:55:36 +05:30
|
|
|
self.assertEqual(
|
2014-03-18 10:50:17 -07:00
|
|
|
[node['id'] for node in jobs_by_pol_part['02']['nodes']], [2, 3])
|
2015-08-06 00:55:36 +05:30
|
|
|
self.assertEqual(
|
2014-03-18 10:50:17 -07:00
|
|
|
[node['id'] for node in jobs_by_pol_part['03']['nodes']], [3, 1])
|
2015-08-06 00:55:36 +05:30
|
|
|
self.assertEqual(
|
2014-03-18 10:50:17 -07:00
|
|
|
[node['id'] for node in jobs_by_pol_part['10']['nodes']], [1, 2])
|
2015-08-06 00:55:36 +05:30
|
|
|
self.assertEqual(
|
2014-03-18 10:50:17 -07:00
|
|
|
[node['id'] for node in jobs_by_pol_part['11']['nodes']],
|
|
|
|
[1, 2, 3])
|
2015-08-06 00:55:36 +05:30
|
|
|
self.assertEqual(
|
2014-03-18 10:50:17 -07:00
|
|
|
[node['id'] for node in jobs_by_pol_part['12']['nodes']], [2, 3])
|
2015-08-06 00:55:36 +05:30
|
|
|
self.assertEqual(
|
2014-03-18 10:50:17 -07:00
|
|
|
[node['id'] for node in jobs_by_pol_part['13']['nodes']], [3, 1])
|
Allow 1+ object-servers-per-disk deployment
Enabled by a new > 0 integer config value, "servers_per_port" in the
[DEFAULT] config section for object-server and/or replication server
configs. The setting's integer value determines how many different
object-server workers handle requests for any single unique local port
in the ring. In this mode, the parent swift-object-server process
continues to run as the original user (i.e. root if low-port binding
is required), binds to all ports as defined in the ring, and forks off
the specified number of workers per listen socket. The child, per-port
servers drop privileges and behave pretty much how object-server workers
always have, except that because the ring has unique ports per disk, the
object-servers will only be handling requests for a single disk. The
parent process detects dead servers and restarts them (with the correct
listen socket), starts missing servers when an updated ring file is
found with a device on the server with a new port, and kills extraneous
servers when their port is found to no longer be in the ring. The ring
files are stat'ed at most every "ring_check_interval" seconds, as
configured in the object-server config (same default of 15s).
Immediately stopping all swift-object-worker processes still works by
sending the parent a SIGTERM. Likewise, a SIGHUP to the parent process
still causes the parent process to close all listen sockets and exit,
allowing existing children to finish serving their existing requests.
The drop_privileges helper function now has an optional param to
suppress the setsid() call, which otherwise screws up the child workers'
process management.
The class method RingData.load() can be told to only load the ring
metadata (i.e. everything except replica2part2dev_id) with the optional
kwarg, header_only=True. This is used to keep the parent and all
forked off workers from unnecessarily having full copies of all storage
policy rings in memory.
A new helper class, swift.common.storage_policy.BindPortsCache,
provides a method to return a set of all device ports in all rings for
the server on which it is instantiated (identified by its set of IP
addresses). The BindPortsCache instance will track mtimes of ring
files, so they are not opened more frequently than necessary.
This patch includes enhancements to the probe tests and
object-replicator/object-reconstructor config plumbing to allow the
probe tests to work correctly both in the "normal" config (same IP but
unique ports for each SAIO "server") and a server-per-port setup where
each SAIO "server" must have a unique IP address and unique port per
disk within each "server". The main probe tests only work with 4
servers and 4 disks, but you can see the difference in the rings for the
EC probe tests where there are 2 disks per server for a total of 8
disks. Specifically, swift.common.ring.utils.is_local_device() will
ignore the ports when the "my_port" argument is None. Then,
object-replicator and object-reconstructor both set self.bind_port to
None if server_per_port is enabled. Bonus improvement for IPv6
addresses in is_local_device().
This PR for vagrant-swift-all-in-one will aid in testing this patch:
https://github.com/swiftstack/vagrant-swift-all-in-one/pull/16/
Also allow SAIO to answer is_local_device() better; common SAIO setups
have multiple "servers" all on the same host with different ports for
the different "servers" (which happen to match the IPs specified in the
rings for the devices on each of those "servers").
However, you can configure the SAIO to have different localhost IP
addresses (e.g. 127.0.0.1, 127.0.0.2, etc.) in the ring and in the
servers' config files' bind_ip setting.
This new whataremyips() implementation combined with a little plumbing
allows is_local_device() to accurately answer, even on an SAIO.
In the default case (an unspecified bind_ip defaults to '0.0.0.0') as
well as an explict "bind to everything" like '0.0.0.0' or '::',
whataremyips() behaves as it always has, returning all IP addresses for
the server.
Also updated probe tests to handle each "server" in the SAIO having a
unique IP address.
For some (noisy) benchmarks that show servers_per_port=X is at least as
good as the same number of "normal" workers:
https://gist.github.com/dbishop/c214f89ca708a6b1624a#file-summary-md
Benchmarks showing the benefits of I/O isolation with a small number of
slow disks:
https://gist.github.com/dbishop/fd0ab067babdecfb07ca#file-results-md
If you were wondering what the overhead of threads_per_disk looks like:
https://gist.github.com/dbishop/1d14755fedc86a161718#file-tabular_results-md
DocImpact
Change-Id: I2239a4000b41a7e7cc53465ce794af49d44796c6
2015-05-14 22:14:15 -07:00
|
|
|
for part in ['00', '01', '02', '03']:
|
2014-03-18 10:50:17 -07:00
|
|
|
for node in jobs_by_pol_part[part]['nodes']:
|
2015-08-06 00:55:36 +05:30
|
|
|
self.assertEqual(node['device'], 'sda')
|
|
|
|
self.assertEqual(jobs_by_pol_part[part]['path'],
|
|
|
|
os.path.join(self.objects, part[1:]))
|
Allow 1+ object-servers-per-disk deployment
Enabled by a new > 0 integer config value, "servers_per_port" in the
[DEFAULT] config section for object-server and/or replication server
configs. The setting's integer value determines how many different
object-server workers handle requests for any single unique local port
in the ring. In this mode, the parent swift-object-server process
continues to run as the original user (i.e. root if low-port binding
is required), binds to all ports as defined in the ring, and forks off
the specified number of workers per listen socket. The child, per-port
servers drop privileges and behave pretty much how object-server workers
always have, except that because the ring has unique ports per disk, the
object-servers will only be handling requests for a single disk. The
parent process detects dead servers and restarts them (with the correct
listen socket), starts missing servers when an updated ring file is
found with a device on the server with a new port, and kills extraneous
servers when their port is found to no longer be in the ring. The ring
files are stat'ed at most every "ring_check_interval" seconds, as
configured in the object-server config (same default of 15s).
Immediately stopping all swift-object-worker processes still works by
sending the parent a SIGTERM. Likewise, a SIGHUP to the parent process
still causes the parent process to close all listen sockets and exit,
allowing existing children to finish serving their existing requests.
The drop_privileges helper function now has an optional param to
suppress the setsid() call, which otherwise screws up the child workers'
process management.
The class method RingData.load() can be told to only load the ring
metadata (i.e. everything except replica2part2dev_id) with the optional
kwarg, header_only=True. This is used to keep the parent and all
forked off workers from unnecessarily having full copies of all storage
policy rings in memory.
A new helper class, swift.common.storage_policy.BindPortsCache,
provides a method to return a set of all device ports in all rings for
the server on which it is instantiated (identified by its set of IP
addresses). The BindPortsCache instance will track mtimes of ring
files, so they are not opened more frequently than necessary.
This patch includes enhancements to the probe tests and
object-replicator/object-reconstructor config plumbing to allow the
probe tests to work correctly both in the "normal" config (same IP but
unique ports for each SAIO "server") and a server-per-port setup where
each SAIO "server" must have a unique IP address and unique port per
disk within each "server". The main probe tests only work with 4
servers and 4 disks, but you can see the difference in the rings for the
EC probe tests where there are 2 disks per server for a total of 8
disks. Specifically, swift.common.ring.utils.is_local_device() will
ignore the ports when the "my_port" argument is None. Then,
object-replicator and object-reconstructor both set self.bind_port to
None if server_per_port is enabled. Bonus improvement for IPv6
addresses in is_local_device().
This PR for vagrant-swift-all-in-one will aid in testing this patch:
https://github.com/swiftstack/vagrant-swift-all-in-one/pull/16/
Also allow SAIO to answer is_local_device() better; common SAIO setups
have multiple "servers" all on the same host with different ports for
the different "servers" (which happen to match the IPs specified in the
rings for the devices on each of those "servers").
However, you can configure the SAIO to have different localhost IP
addresses (e.g. 127.0.0.1, 127.0.0.2, etc.) in the ring and in the
servers' config files' bind_ip setting.
This new whataremyips() implementation combined with a little plumbing
allows is_local_device() to accurately answer, even on an SAIO.
In the default case (an unspecified bind_ip defaults to '0.0.0.0') as
well as an explict "bind to everything" like '0.0.0.0' or '::',
whataremyips() behaves as it always has, returning all IP addresses for
the server.
Also updated probe tests to handle each "server" in the SAIO having a
unique IP address.
For some (noisy) benchmarks that show servers_per_port=X is at least as
good as the same number of "normal" workers:
https://gist.github.com/dbishop/c214f89ca708a6b1624a#file-summary-md
Benchmarks showing the benefits of I/O isolation with a small number of
slow disks:
https://gist.github.com/dbishop/fd0ab067babdecfb07ca#file-results-md
If you were wondering what the overhead of threads_per_disk looks like:
https://gist.github.com/dbishop/1d14755fedc86a161718#file-tabular_results-md
DocImpact
Change-Id: I2239a4000b41a7e7cc53465ce794af49d44796c6
2015-05-14 22:14:15 -07:00
|
|
|
for part in ['10', '11', '12', '13']:
|
2014-03-18 10:50:17 -07:00
|
|
|
for node in jobs_by_pol_part[part]['nodes']:
|
2015-08-06 00:55:36 +05:30
|
|
|
self.assertEqual(node['device'], 'sda')
|
|
|
|
self.assertEqual(jobs_by_pol_part[part]['path'],
|
|
|
|
os.path.join(self.objects_1, part[1:]))
|
2010-11-05 09:15:31 -07:00
|
|
|
|
2017-09-01 14:15:45 -07:00
|
|
|
def test_collect_jobs_unmounted(self):
|
|
|
|
with mock_check_drive() as mocks:
|
|
|
|
jobs = self.replicator.collect_jobs()
|
|
|
|
self.assertEqual(jobs, [])
|
|
|
|
self.assertEqual(mocks['ismount'].mock_calls, [])
|
|
|
|
self.assertEqual(len(mocks['isdir'].mock_calls), 2)
|
|
|
|
|
|
|
|
self.replicator.mount_check = True
|
|
|
|
with mock_check_drive() as mocks:
|
|
|
|
jobs = self.replicator.collect_jobs()
|
|
|
|
self.assertEqual(jobs, [])
|
|
|
|
self.assertEqual(mocks['isdir'].mock_calls, [])
|
|
|
|
self.assertEqual(len(mocks['ismount'].mock_calls), 2)
|
|
|
|
|
2016-08-11 00:53:13 +08:00
|
|
|
    def test_collect_jobs_failure_report_with_auditor_stats_json(self):
        # Build a four-device ring (sda plus three freshly written disks)
        # whose auditor stats JSON is present, then verify that a
        # partition-filtered collect_jobs() records no failures.
        devs = [
            {'id': 0, 'device': 'sda', 'zone': 0,
             'region': 1, 'ip': '1.1.1.1', 'port': 1111,
             'replication_ip': '127.0.0.0', 'replication_port': 6200},
            {'id': 1, 'device': 'sdb', 'zone': 1,
             'region': 1, 'ip': '1.1.1.1', 'port': 1111,
             'replication_ip': '127.0.0.0', 'replication_port': 6200},
            {'id': 2, 'device': 'sdc', 'zone': 2,
             'region': 1, 'ip': '1.1.1.1', 'port': 1111,
             'replication_ip': '127.0.0.1', 'replication_port': 6200},
            {'id': 3, 'device': 'sdd', 'zone': 3,
             'region': 1, 'ip': '1.1.1.1', 'port': 1111,
             'replication_ip': '127.0.0.1', 'replication_port': 6200},
        ]
        # Lay down object data (with auditor_status json files) for the
        # three additional devices; only the objects dirs matter here.
        objects_sdb, objects_1_sdb, _, _ = \
            self._write_disk_data('sdb', with_json=True)
        objects_sdc, objects_1_sdc, _, _ = \
            self._write_disk_data('sdc', with_json=True)
        objects_sdd, objects_1_sdd, _, _ = \
            self._write_disk_data('sdd', with_json=True)
        _create_test_rings(self.testdir, devs)

        self.replicator.collect_jobs(override_partitions=[1])
        self.assertEqual(self.replicator.total_stats.failure, 0)
|
2019-04-09 14:15:32 -05:00
|
|
|
def test_collect_jobs_with_override_parts_and_unexpected_part_dir(self):
|
|
|
|
self.replicator.collect_jobs(override_partitions=[0, 2])
|
|
|
|
self.assertEqual(self.replicator.total_stats.failure, 0)
|
|
|
|
os.mkdir(os.path.join(self.objects_1, 'foo'))
|
|
|
|
jobs = self.replicator.collect_jobs(override_partitions=[0, 2])
|
|
|
|
found_jobs = set()
|
|
|
|
for j in jobs:
|
|
|
|
found_jobs.add((int(j['policy']), int(j['partition'])))
|
|
|
|
self.assertEqual(found_jobs, {
|
|
|
|
(0, 0),
|
|
|
|
(0, 2),
|
|
|
|
(1, 0),
|
|
|
|
(1, 2),
|
|
|
|
})
|
|
|
|
num_disks = len(POLICIES[1].object_ring.devs)
|
|
|
|
# N.B. it's not clear why the UUT increments failure per device
|
|
|
|
self.assertEqual(self.replicator.total_stats.failure, num_disks)
|
|
|
|
|
Allow 1+ object-servers-per-disk deployment
Enabled by a new > 0 integer config value, "servers_per_port" in the
[DEFAULT] config section for object-server and/or replication server
configs. The setting's integer value determines how many different
object-server workers handle requests for any single unique local port
in the ring. In this mode, the parent swift-object-server process
continues to run as the original user (i.e. root if low-port binding
is required), binds to all ports as defined in the ring, and forks off
the specified number of workers per listen socket. The child, per-port
servers drop privileges and behave pretty much how object-server workers
always have, except that because the ring has unique ports per disk, the
object-servers will only be handling requests for a single disk. The
parent process detects dead servers and restarts them (with the correct
listen socket), starts missing servers when an updated ring file is
found with a device on the server with a new port, and kills extraneous
servers when their port is found to no longer be in the ring. The ring
files are stat'ed at most every "ring_check_interval" seconds, as
configured in the object-server config (same default of 15s).
Immediately stopping all swift-object-worker processes still works by
sending the parent a SIGTERM. Likewise, a SIGHUP to the parent process
still causes the parent process to close all listen sockets and exit,
allowing existing children to finish serving their existing requests.
The drop_privileges helper function now has an optional param to
suppress the setsid() call, which otherwise screws up the child workers'
process management.
The class method RingData.load() can be told to only load the ring
metadata (i.e. everything except replica2part2dev_id) with the optional
kwarg, header_only=True. This is used to keep the parent and all
forked off workers from unnecessarily having full copies of all storage
policy rings in memory.
A new helper class, swift.common.storage_policy.BindPortsCache,
provides a method to return a set of all device ports in all rings for
the server on which it is instantiated (identified by its set of IP
addresses). The BindPortsCache instance will track mtimes of ring
files, so they are not opened more frequently than necessary.
This patch includes enhancements to the probe tests and
object-replicator/object-reconstructor config plumbing to allow the
probe tests to work correctly both in the "normal" config (same IP but
unique ports for each SAIO "server") and a server-per-port setup where
each SAIO "server" must have a unique IP address and unique port per
disk within each "server". The main probe tests only work with 4
servers and 4 disks, but you can see the difference in the rings for the
EC probe tests where there are 2 disks per server for a total of 8
disks. Specifically, swift.common.ring.utils.is_local_device() will
ignore the ports when the "my_port" argument is None. Then,
object-replicator and object-reconstructor both set self.bind_port to
None if server_per_port is enabled. Bonus improvement for IPv6
addresses in is_local_device().
This PR for vagrant-swift-all-in-one will aid in testing this patch:
https://github.com/swiftstack/vagrant-swift-all-in-one/pull/16/
Also allow SAIO to answer is_local_device() better; common SAIO setups
have multiple "servers" all on the same host with different ports for
the different "servers" (which happen to match the IPs specified in the
rings for the devices on each of those "servers").
However, you can configure the SAIO to have different localhost IP
addresses (e.g. 127.0.0.1, 127.0.0.2, etc.) in the ring and in the
servers' config files' bind_ip setting.
This new whataremyips() implementation combined with a little plumbing
allows is_local_device() to accurately answer, even on an SAIO.
In the default case (an unspecified bind_ip defaults to '0.0.0.0') as
well as an explict "bind to everything" like '0.0.0.0' or '::',
whataremyips() behaves as it always has, returning all IP addresses for
the server.
Also updated probe tests to handle each "server" in the SAIO having a
unique IP address.
For some (noisy) benchmarks that show servers_per_port=X is at least as
good as the same number of "normal" workers:
https://gist.github.com/dbishop/c214f89ca708a6b1624a#file-summary-md
Benchmarks showing the benefits of I/O isolation with a small number of
slow disks:
https://gist.github.com/dbishop/fd0ab067babdecfb07ca#file-results-md
If you were wondering what the overhead of threads_per_disk looks like:
https://gist.github.com/dbishop/1d14755fedc86a161718#file-tabular_results-md
DocImpact
Change-Id: I2239a4000b41a7e7cc53465ce794af49d44796c6
2015-05-14 22:14:15 -07:00
|
|
|
@mock.patch('swift.obj.replicator.random.shuffle', side_effect=lambda l: l)
def test_collect_jobs_multi_disk(self, mock_shuffle):
    """
    collect_jobs() with a 4-device ring spread over two servers.

    Only sda/sdb are local to this replicator, so every job must target
    one of those two devices; sdc/sdd only appear as remote nodes.  Also
    verifies that the full job list is passed through random.shuffle
    exactly once (shuffle is patched to a no-op so job order is stable).
    """
    devs = [
        # Two disks on same IP/port
        {'id': 0, 'device': 'sda', 'zone': 0,
         'region': 1, 'ip': '1.1.1.1', 'port': 1111,
         'replication_ip': '127.0.0.0', 'replication_port': 6200},
        {'id': 1, 'device': 'sdb', 'zone': 1,
         'region': 1, 'ip': '1.1.1.1', 'port': 1111,
         'replication_ip': '127.0.0.0', 'replication_port': 6200},
        # Two disks on same server, different ports
        {'id': 2, 'device': 'sdc', 'zone': 2,
         'region': 2, 'ip': '1.1.1.2', 'port': 1112,
         'replication_ip': '127.0.0.1', 'replication_port': 6200},
        {'id': 3, 'device': 'sdd', 'zone': 4,
         'region': 2, 'ip': '1.1.1.2', 'port': 1112,
         'replication_ip': '127.0.0.1', 'replication_port': 6201},
    ]
    # Lay down object data for the extra disks and rebuild the rings so
    # collect_jobs() sees the 4-device topology above.
    objects_sdb, objects_1_sdb, _, _ = self._write_disk_data('sdb')
    objects_sdc, objects_1_sdc, _, _ = self._write_disk_data('sdc')
    objects_sdd, objects_1_sdd, _, _ = self._write_disk_data('sdd')
    _create_test_rings(self.testdir, devs)

    jobs = self.replicator.collect_jobs()

    # The complete job list is shuffled exactly once.
    self.assertEqual([mock.call(jobs)], mock_shuffle.mock_calls)

    # Handoff (delete) jobs: one per policy for each partition that is
    # not assigned to the local device holding it.
    jobs_to_delete = [j for j in jobs if j['delete']]
    self.assertEqual(len(jobs_to_delete), 4)
    self.assertEqual([
        '1', '2',  # policy 0; 1 not on sda, 2 not on sdb
        '1', '2',  # policy 1; 1 not on sda, 2 not on sdb
    ], [j['partition'] for j in jobs_to_delete])

    jobs_by_pol_part_dev = {}
    for job in jobs:
        # There should be no jobs with a device not in just sda & sdb
        self.assertTrue(job['device'] in ('sda', 'sdb'))
        jobs_by_pol_part_dev[
            str(int(job['policy'])) + job['partition'] + job['device']
        ] = job

    # Expected remote node ids for each policy + partition + local
    # device, keyed '<policy><partition><device>'.  Replaces sixteen
    # copy-pasted assertEqual calls with one data-driven check.
    expected_node_ids = {
        '00sda': [1, 2], '00sdb': [0, 2],
        '01sda': [1, 2, 3], '01sdb': [2, 3],
        '02sda': [2, 3], '02sdb': [2, 3, 0],
        '03sda': [3, 1], '03sdb': [3, 0],
        '10sda': [1, 2], '10sdb': [0, 2],
        '11sda': [1, 2, 3], '11sdb': [2, 3],
        '12sda': [2, 3], '12sdb': [2, 3, 0],
        '13sda': [3, 1], '13sdb': [3, 0],
    }
    for key, expected in sorted(expected_node_ids.items()):
        self.assertEqual(
            [node['id'] for node in jobs_by_pol_part_dev[key]['nodes']],
            expected)

    # Policy-0 jobs point at each device's policy-0 objects directory;
    # part[0] is the policy digit, part[1:] the partition name.
    for part in ['00', '01', '02', '03']:
        self.assertEqual(jobs_by_pol_part_dev[part + 'sda']['path'],
                         os.path.join(self.objects, part[1:]))
        self.assertEqual(jobs_by_pol_part_dev[part + 'sdb']['path'],
                         os.path.join(objects_sdb, part[1:]))
|
Allow 1+ object-servers-per-disk deployment
Enabled by a new > 0 integer config value, "servers_per_port" in the
[DEFAULT] config section for object-server and/or replication server
configs. The setting's integer value determines how many different
object-server workers handle requests for any single unique local port
in the ring. In this mode, the parent swift-object-server process
continues to run as the original user (i.e. root if low-port binding
is required), binds to all ports as defined in the ring, and forks off
the specified number of workers per listen socket. The child, per-port
servers drop privileges and behave pretty much how object-server workers
always have, except that because the ring has unique ports per disk, the
object-servers will only be handling requests for a single disk. The
parent process detects dead servers and restarts them (with the correct
listen socket), starts missing servers when an updated ring file is
found with a device on the server with a new port, and kills extraneous
servers when their port is found to no longer be in the ring. The ring
files are stat'ed at most every "ring_check_interval" seconds, as
configured in the object-server config (same default of 15s).
Immediately stopping all swift-object-worker processes still works by
sending the parent a SIGTERM. Likewise, a SIGHUP to the parent process
still causes the parent process to close all listen sockets and exit,
allowing existing children to finish serving their existing requests.
The drop_privileges helper function now has an optional param to
suppress the setsid() call, which otherwise screws up the child workers'
process management.
The class method RingData.load() can be told to only load the ring
metadata (i.e. everything except replica2part2dev_id) with the optional
kwarg, header_only=True. This is used to keep the parent and all
forked off workers from unnecessarily having full copies of all storage
policy rings in memory.
A new helper class, swift.common.storage_policy.BindPortsCache,
provides a method to return a set of all device ports in all rings for
the server on which it is instantiated (identified by its set of IP
addresses). The BindPortsCache instance will track mtimes of ring
files, so they are not opened more frequently than necessary.
This patch includes enhancements to the probe tests and
object-replicator/object-reconstructor config plumbing to allow the
probe tests to work correctly both in the "normal" config (same IP but
unique ports for each SAIO "server") and a server-per-port setup where
each SAIO "server" must have a unique IP address and unique port per
disk within each "server". The main probe tests only work with 4
servers and 4 disks, but you can see the difference in the rings for the
EC probe tests where there are 2 disks per server for a total of 8
disks. Specifically, swift.common.ring.utils.is_local_device() will
ignore the ports when the "my_port" argument is None. Then,
object-replicator and object-reconstructor both set self.bind_port to
None if server_per_port is enabled. Bonus improvement for IPv6
addresses in is_local_device().
This PR for vagrant-swift-all-in-one will aid in testing this patch:
https://github.com/swiftstack/vagrant-swift-all-in-one/pull/16/
Also allow SAIO to answer is_local_device() better; common SAIO setups
have multiple "servers" all on the same host with different ports for
the different "servers" (which happen to match the IPs specified in the
rings for the devices on each of those "servers").
However, you can configure the SAIO to have different localhost IP
addresses (e.g. 127.0.0.1, 127.0.0.2, etc.) in the ring and in the
servers' config files' bind_ip setting.
This new whataremyips() implementation combined with a little plumbing
allows is_local_device() to accurately answer, even on an SAIO.
In the default case (an unspecified bind_ip defaults to '0.0.0.0') as
well as an explict "bind to everything" like '0.0.0.0' or '::',
whataremyips() behaves as it always has, returning all IP addresses for
the server.
Also updated probe tests to handle each "server" in the SAIO having a
unique IP address.
For some (noisy) benchmarks that show servers_per_port=X is at least as
good as the same number of "normal" workers:
https://gist.github.com/dbishop/c214f89ca708a6b1624a#file-summary-md
Benchmarks showing the benefits of I/O isolation with a small number of
slow disks:
https://gist.github.com/dbishop/fd0ab067babdecfb07ca#file-results-md
If you were wondering what the overhead of threads_per_disk looks like:
https://gist.github.com/dbishop/1d14755fedc86a161718#file-tabular_results-md
DocImpact
Change-Id: I2239a4000b41a7e7cc53465ce794af49d44796c6
2015-05-14 22:14:15 -07:00
|
|
|
for part in ['10', '11', '12', '13']:
|
2015-08-06 00:55:36 +05:30
|
|
|
self.assertEqual(jobs_by_pol_part_dev[part + 'sda']['path'],
|
|
|
|
os.path.join(self.objects_1, part[1:]))
|
|
|
|
self.assertEqual(jobs_by_pol_part_dev[part + 'sdb']['path'],
|
|
|
|
os.path.join(objects_1_sdb, part[1:]))
|
Allow 1+ object-servers-per-disk deployment
Enabled by a new > 0 integer config value, "servers_per_port" in the
[DEFAULT] config section for object-server and/or replication server
configs. The setting's integer value determines how many different
object-server workers handle requests for any single unique local port
in the ring. In this mode, the parent swift-object-server process
continues to run as the original user (i.e. root if low-port binding
is required), binds to all ports as defined in the ring, and forks off
the specified number of workers per listen socket. The child, per-port
servers drop privileges and behave pretty much how object-server workers
always have, except that because the ring has unique ports per disk, the
object-servers will only be handling requests for a single disk. The
parent process detects dead servers and restarts them (with the correct
listen socket), starts missing servers when an updated ring file is
found with a device on the server with a new port, and kills extraneous
servers when their port is found to no longer be in the ring. The ring
files are stat'ed at most every "ring_check_interval" seconds, as
configured in the object-server config (same default of 15s).
Immediately stopping all swift-object-worker processes still works by
sending the parent a SIGTERM. Likewise, a SIGHUP to the parent process
still causes the parent process to close all listen sockets and exit,
allowing existing children to finish serving their existing requests.
The drop_privileges helper function now has an optional param to
suppress the setsid() call, which otherwise screws up the child workers'
process management.
The class method RingData.load() can be told to only load the ring
metadata (i.e. everything except replica2part2dev_id) with the optional
kwarg, header_only=True. This is used to keep the parent and all
forked off workers from unnecessarily having full copies of all storage
policy rings in memory.
A new helper class, swift.common.storage_policy.BindPortsCache,
provides a method to return a set of all device ports in all rings for
the server on which it is instantiated (identified by its set of IP
addresses). The BindPortsCache instance will track mtimes of ring
files, so they are not opened more frequently than necessary.
This patch includes enhancements to the probe tests and
object-replicator/object-reconstructor config plumbing to allow the
probe tests to work correctly both in the "normal" config (same IP but
unique ports for each SAIO "server") and a server-per-port setup where
each SAIO "server" must have a unique IP address and unique port per
disk within each "server". The main probe tests only work with 4
servers and 4 disks, but you can see the difference in the rings for the
EC probe tests where there are 2 disks per server for a total of 8
disks. Specifically, swift.common.ring.utils.is_local_device() will
ignore the ports when the "my_port" argument is None. Then,
object-replicator and object-reconstructor both set self.bind_port to
None if server_per_port is enabled. Bonus improvement for IPv6
addresses in is_local_device().
This PR for vagrant-swift-all-in-one will aid in testing this patch:
https://github.com/swiftstack/vagrant-swift-all-in-one/pull/16/
Also allow SAIO to answer is_local_device() better; common SAIO setups
have multiple "servers" all on the same host with different ports for
the different "servers" (which happen to match the IPs specified in the
rings for the devices on each of those "servers").
However, you can configure the SAIO to have different localhost IP
addresses (e.g. 127.0.0.1, 127.0.0.2, etc.) in the ring and in the
servers' config files' bind_ip setting.
This new whataremyips() implementation combined with a little plumbing
allows is_local_device() to accurately answer, even on an SAIO.
In the default case (an unspecified bind_ip defaults to '0.0.0.0') as
well as an explict "bind to everything" like '0.0.0.0' or '::',
whataremyips() behaves as it always has, returning all IP addresses for
the server.
Also updated probe tests to handle each "server" in the SAIO having a
unique IP address.
For some (noisy) benchmarks that show servers_per_port=X is at least as
good as the same number of "normal" workers:
https://gist.github.com/dbishop/c214f89ca708a6b1624a#file-summary-md
Benchmarks showing the benefits of I/O isolation with a small number of
slow disks:
https://gist.github.com/dbishop/fd0ab067babdecfb07ca#file-results-md
If you were wondering what the overhead of threads_per_disk looks like:
https://gist.github.com/dbishop/1d14755fedc86a161718#file-tabular_results-md
DocImpact
Change-Id: I2239a4000b41a7e7cc53465ce794af49d44796c6
2015-05-14 22:14:15 -07:00
|
|
|
|
|
|
|
@mock.patch('swift.obj.replicator.random.shuffle', side_effect=lambda l: l)
|
|
|
|
def test_collect_jobs_multi_disk_diff_ports_normal(self, mock_shuffle):
|
|
|
|
# Normally (servers_per_port=0), replication_ip AND replication_port
|
|
|
|
# are used to determine local ring device entries. Here we show that
|
2016-02-01 18:06:54 +00:00
|
|
|
# with bind_ip='127.0.0.1', bind_port=6200, only "sdc" is local.
|
Allow 1+ object-servers-per-disk deployment
Enabled by a new > 0 integer config value, "servers_per_port" in the
[DEFAULT] config section for object-server and/or replication server
configs. The setting's integer value determines how many different
object-server workers handle requests for any single unique local port
in the ring. In this mode, the parent swift-object-server process
continues to run as the original user (i.e. root if low-port binding
is required), binds to all ports as defined in the ring, and forks off
the specified number of workers per listen socket. The child, per-port
servers drop privileges and behave pretty much how object-server workers
always have, except that because the ring has unique ports per disk, the
object-servers will only be handling requests for a single disk. The
parent process detects dead servers and restarts them (with the correct
listen socket), starts missing servers when an updated ring file is
found with a device on the server with a new port, and kills extraneous
servers when their port is found to no longer be in the ring. The ring
files are stat'ed at most every "ring_check_interval" seconds, as
configured in the object-server config (same default of 15s).
Immediately stopping all swift-object-worker processes still works by
sending the parent a SIGTERM. Likewise, a SIGHUP to the parent process
still causes the parent process to close all listen sockets and exit,
allowing existing children to finish serving their existing requests.
The drop_privileges helper function now has an optional param to
suppress the setsid() call, which otherwise screws up the child workers'
process management.
The class method RingData.load() can be told to only load the ring
metadata (i.e. everything except replica2part2dev_id) with the optional
kwarg, header_only=True. This is used to keep the parent and all
forked off workers from unnecessarily having full copies of all storage
policy rings in memory.
A new helper class, swift.common.storage_policy.BindPortsCache,
provides a method to return a set of all device ports in all rings for
the server on which it is instantiated (identified by its set of IP
addresses). The BindPortsCache instance will track mtimes of ring
files, so they are not opened more frequently than necessary.
This patch includes enhancements to the probe tests and
object-replicator/object-reconstructor config plumbing to allow the
probe tests to work correctly both in the "normal" config (same IP but
unique ports for each SAIO "server") and a server-per-port setup where
each SAIO "server" must have a unique IP address and unique port per
disk within each "server". The main probe tests only work with 4
servers and 4 disks, but you can see the difference in the rings for the
EC probe tests where there are 2 disks per server for a total of 8
disks. Specifically, swift.common.ring.utils.is_local_device() will
ignore the ports when the "my_port" argument is None. Then,
object-replicator and object-reconstructor both set self.bind_port to
None if server_per_port is enabled. Bonus improvement for IPv6
addresses in is_local_device().
This PR for vagrant-swift-all-in-one will aid in testing this patch:
https://github.com/swiftstack/vagrant-swift-all-in-one/pull/16/
Also allow SAIO to answer is_local_device() better; common SAIO setups
have multiple "servers" all on the same host with different ports for
the different "servers" (which happen to match the IPs specified in the
rings for the devices on each of those "servers").
However, you can configure the SAIO to have different localhost IP
addresses (e.g. 127.0.0.1, 127.0.0.2, etc.) in the ring and in the
servers' config files' bind_ip setting.
This new whataremyips() implementation combined with a little plumbing
allows is_local_device() to accurately answer, even on an SAIO.
In the default case (an unspecified bind_ip defaults to '0.0.0.0') as
well as an explict "bind to everything" like '0.0.0.0' or '::',
whataremyips() behaves as it always has, returning all IP addresses for
the server.
Also updated probe tests to handle each "server" in the SAIO having a
unique IP address.
For some (noisy) benchmarks that show servers_per_port=X is at least as
good as the same number of "normal" workers:
https://gist.github.com/dbishop/c214f89ca708a6b1624a#file-summary-md
Benchmarks showing the benefits of I/O isolation with a small number of
slow disks:
https://gist.github.com/dbishop/fd0ab067babdecfb07ca#file-results-md
If you were wondering what the overhead of threads_per_disk looks like:
https://gist.github.com/dbishop/1d14755fedc86a161718#file-tabular_results-md
DocImpact
Change-Id: I2239a4000b41a7e7cc53465ce794af49d44796c6
2015-05-14 22:14:15 -07:00
|
|
|
devs = [
|
|
|
|
# Two disks on same IP/port
|
|
|
|
{'id': 0, 'device': 'sda', 'zone': 0,
|
|
|
|
'region': 1, 'ip': '1.1.1.1', 'port': 1111,
|
2016-02-01 18:06:54 +00:00
|
|
|
'replication_ip': '127.0.0.0', 'replication_port': 6200},
|
Allow 1+ object-servers-per-disk deployment
Enabled by a new > 0 integer config value, "servers_per_port" in the
[DEFAULT] config section for object-server and/or replication server
configs. The setting's integer value determines how many different
object-server workers handle requests for any single unique local port
in the ring. In this mode, the parent swift-object-server process
continues to run as the original user (i.e. root if low-port binding
is required), binds to all ports as defined in the ring, and forks off
the specified number of workers per listen socket. The child, per-port
servers drop privileges and behave pretty much how object-server workers
always have, except that because the ring has unique ports per disk, the
object-servers will only be handling requests for a single disk. The
parent process detects dead servers and restarts them (with the correct
listen socket), starts missing servers when an updated ring file is
found with a device on the server with a new port, and kills extraneous
servers when their port is found to no longer be in the ring. The ring
files are stat'ed at most every "ring_check_interval" seconds, as
configured in the object-server config (same default of 15s).
Immediately stopping all swift-object-worker processes still works by
sending the parent a SIGTERM. Likewise, a SIGHUP to the parent process
still causes the parent process to close all listen sockets and exit,
allowing existing children to finish serving their existing requests.
The drop_privileges helper function now has an optional param to
suppress the setsid() call, which otherwise screws up the child workers'
process management.
The class method RingData.load() can be told to only load the ring
metadata (i.e. everything except replica2part2dev_id) with the optional
kwarg, header_only=True. This is used to keep the parent and all
forked off workers from unnecessarily having full copies of all storage
policy rings in memory.
A new helper class, swift.common.storage_policy.BindPortsCache,
provides a method to return a set of all device ports in all rings for
the server on which it is instantiated (identified by its set of IP
addresses). The BindPortsCache instance will track mtimes of ring
files, so they are not opened more frequently than necessary.
This patch includes enhancements to the probe tests and
object-replicator/object-reconstructor config plumbing to allow the
probe tests to work correctly both in the "normal" config (same IP but
unique ports for each SAIO "server") and a server-per-port setup where
each SAIO "server" must have a unique IP address and unique port per
disk within each "server". The main probe tests only work with 4
servers and 4 disks, but you can see the difference in the rings for the
EC probe tests where there are 2 disks per server for a total of 8
disks. Specifically, swift.common.ring.utils.is_local_device() will
ignore the ports when the "my_port" argument is None. Then,
object-replicator and object-reconstructor both set self.bind_port to
None if server_per_port is enabled. Bonus improvement for IPv6
addresses in is_local_device().
This PR for vagrant-swift-all-in-one will aid in testing this patch:
https://github.com/swiftstack/vagrant-swift-all-in-one/pull/16/
Also allow SAIO to answer is_local_device() better; common SAIO setups
have multiple "servers" all on the same host with different ports for
the different "servers" (which happen to match the IPs specified in the
rings for the devices on each of those "servers").
However, you can configure the SAIO to have different localhost IP
addresses (e.g. 127.0.0.1, 127.0.0.2, etc.) in the ring and in the
servers' config files' bind_ip setting.
This new whataremyips() implementation combined with a little plumbing
allows is_local_device() to accurately answer, even on an SAIO.
In the default case (an unspecified bind_ip defaults to '0.0.0.0') as
well as an explict "bind to everything" like '0.0.0.0' or '::',
whataremyips() behaves as it always has, returning all IP addresses for
the server.
Also updated probe tests to handle each "server" in the SAIO having a
unique IP address.
For some (noisy) benchmarks that show servers_per_port=X is at least as
good as the same number of "normal" workers:
https://gist.github.com/dbishop/c214f89ca708a6b1624a#file-summary-md
Benchmarks showing the benefits of I/O isolation with a small number of
slow disks:
https://gist.github.com/dbishop/fd0ab067babdecfb07ca#file-results-md
If you were wondering what the overhead of threads_per_disk looks like:
https://gist.github.com/dbishop/1d14755fedc86a161718#file-tabular_results-md
DocImpact
Change-Id: I2239a4000b41a7e7cc53465ce794af49d44796c6
2015-05-14 22:14:15 -07:00
|
|
|
{'id': 1, 'device': 'sdb', 'zone': 1,
|
|
|
|
'region': 1, 'ip': '1.1.1.1', 'port': 1111,
|
2016-02-01 18:06:54 +00:00
|
|
|
'replication_ip': '127.0.0.0', 'replication_port': 6200},
|
Allow 1+ object-servers-per-disk deployment
Enabled by a new > 0 integer config value, "servers_per_port" in the
[DEFAULT] config section for object-server and/or replication server
configs. The setting's integer value determines how many different
object-server workers handle requests for any single unique local port
in the ring. In this mode, the parent swift-object-server process
continues to run as the original user (i.e. root if low-port binding
is required), binds to all ports as defined in the ring, and forks off
the specified number of workers per listen socket. The child, per-port
servers drop privileges and behave pretty much how object-server workers
always have, except that because the ring has unique ports per disk, the
object-servers will only be handling requests for a single disk. The
parent process detects dead servers and restarts them (with the correct
listen socket), starts missing servers when an updated ring file is
found with a device on the server with a new port, and kills extraneous
servers when their port is found to no longer be in the ring. The ring
files are stat'ed at most every "ring_check_interval" seconds, as
configured in the object-server config (same default of 15s).
Immediately stopping all swift-object-worker processes still works by
sending the parent a SIGTERM. Likewise, a SIGHUP to the parent process
still causes the parent process to close all listen sockets and exit,
allowing existing children to finish serving their existing requests.
The drop_privileges helper function now has an optional param to
suppress the setsid() call, which otherwise screws up the child workers'
process management.
The class method RingData.load() can be told to only load the ring
metadata (i.e. everything except replica2part2dev_id) with the optional
kwarg, header_only=True. This is used to keep the parent and all
forked off workers from unnecessarily having full copies of all storage
policy rings in memory.
A new helper class, swift.common.storage_policy.BindPortsCache,
provides a method to return a set of all device ports in all rings for
the server on which it is instantiated (identified by its set of IP
addresses). The BindPortsCache instance will track mtimes of ring
files, so they are not opened more frequently than necessary.
This patch includes enhancements to the probe tests and
object-replicator/object-reconstructor config plumbing to allow the
probe tests to work correctly both in the "normal" config (same IP but
unique ports for each SAIO "server") and a server-per-port setup where
each SAIO "server" must have a unique IP address and unique port per
disk within each "server". The main probe tests only work with 4
servers and 4 disks, but you can see the difference in the rings for the
EC probe tests where there are 2 disks per server for a total of 8
disks. Specifically, swift.common.ring.utils.is_local_device() will
ignore the ports when the "my_port" argument is None. Then,
object-replicator and object-reconstructor both set self.bind_port to
None if server_per_port is enabled. Bonus improvement for IPv6
addresses in is_local_device().
This PR for vagrant-swift-all-in-one will aid in testing this patch:
https://github.com/swiftstack/vagrant-swift-all-in-one/pull/16/
Also allow SAIO to answer is_local_device() better; common SAIO setups
have multiple "servers" all on the same host with different ports for
the different "servers" (which happen to match the IPs specified in the
rings for the devices on each of those "servers").
However, you can configure the SAIO to have different localhost IP
addresses (e.g. 127.0.0.1, 127.0.0.2, etc.) in the ring and in the
servers' config files' bind_ip setting.
This new whataremyips() implementation combined with a little plumbing
allows is_local_device() to accurately answer, even on an SAIO.
In the default case (an unspecified bind_ip defaults to '0.0.0.0') as
well as an explict "bind to everything" like '0.0.0.0' or '::',
whataremyips() behaves as it always has, returning all IP addresses for
the server.
Also updated probe tests to handle each "server" in the SAIO having a
unique IP address.
For some (noisy) benchmarks that show servers_per_port=X is at least as
good as the same number of "normal" workers:
https://gist.github.com/dbishop/c214f89ca708a6b1624a#file-summary-md
Benchmarks showing the benefits of I/O isolation with a small number of
slow disks:
https://gist.github.com/dbishop/fd0ab067babdecfb07ca#file-results-md
If you were wondering what the overhead of threads_per_disk looks like:
https://gist.github.com/dbishop/1d14755fedc86a161718#file-tabular_results-md
DocImpact
Change-Id: I2239a4000b41a7e7cc53465ce794af49d44796c6
2015-05-14 22:14:15 -07:00
|
|
|
# Two disks on same server, different ports
|
|
|
|
{'id': 2, 'device': 'sdc', 'zone': 2,
|
|
|
|
'region': 2, 'ip': '1.1.1.2', 'port': 1112,
|
2016-02-01 18:06:54 +00:00
|
|
|
'replication_ip': '127.0.0.1', 'replication_port': 6200},
|
Allow 1+ object-servers-per-disk deployment
Enabled by a new > 0 integer config value, "servers_per_port" in the
[DEFAULT] config section for object-server and/or replication server
configs. The setting's integer value determines how many different
object-server workers handle requests for any single unique local port
in the ring. In this mode, the parent swift-object-server process
continues to run as the original user (i.e. root if low-port binding
is required), binds to all ports as defined in the ring, and forks off
the specified number of workers per listen socket. The child, per-port
servers drop privileges and behave pretty much how object-server workers
always have, except that because the ring has unique ports per disk, the
object-servers will only be handling requests for a single disk. The
parent process detects dead servers and restarts them (with the correct
listen socket), starts missing servers when an updated ring file is
found with a device on the server with a new port, and kills extraneous
servers when their port is found to no longer be in the ring. The ring
files are stat'ed at most every "ring_check_interval" seconds, as
configured in the object-server config (same default of 15s).
Immediately stopping all swift-object-worker processes still works by
sending the parent a SIGTERM. Likewise, a SIGHUP to the parent process
still causes the parent process to close all listen sockets and exit,
allowing existing children to finish serving their existing requests.
The drop_privileges helper function now has an optional param to
suppress the setsid() call, which otherwise screws up the child workers'
process management.
The class method RingData.load() can be told to only load the ring
metadata (i.e. everything except replica2part2dev_id) with the optional
kwarg, header_only=True. This is used to keep the parent and all
forked off workers from unnecessarily having full copies of all storage
policy rings in memory.
A new helper class, swift.common.storage_policy.BindPortsCache,
provides a method to return a set of all device ports in all rings for
the server on which it is instantiated (identified by its set of IP
addresses). The BindPortsCache instance will track mtimes of ring
files, so they are not opened more frequently than necessary.
This patch includes enhancements to the probe tests and
object-replicator/object-reconstructor config plumbing to allow the
probe tests to work correctly both in the "normal" config (same IP but
unique ports for each SAIO "server") and a server-per-port setup where
each SAIO "server" must have a unique IP address and unique port per
disk within each "server". The main probe tests only work with 4
servers and 4 disks, but you can see the difference in the rings for the
EC probe tests where there are 2 disks per server for a total of 8
disks. Specifically, swift.common.ring.utils.is_local_device() will
ignore the ports when the "my_port" argument is None. Then,
object-replicator and object-reconstructor both set self.bind_port to
None if server_per_port is enabled. Bonus improvement for IPv6
addresses in is_local_device().
This PR for vagrant-swift-all-in-one will aid in testing this patch:
https://github.com/swiftstack/vagrant-swift-all-in-one/pull/16/
Also allow SAIO to answer is_local_device() better; common SAIO setups
have multiple "servers" all on the same host with different ports for
the different "servers" (which happen to match the IPs specified in the
rings for the devices on each of those "servers").
However, you can configure the SAIO to have different localhost IP
addresses (e.g. 127.0.0.1, 127.0.0.2, etc.) in the ring and in the
servers' config files' bind_ip setting.
This new whataremyips() implementation combined with a little plumbing
allows is_local_device() to accurately answer, even on an SAIO.
In the default case (an unspecified bind_ip defaults to '0.0.0.0') as
well as an explict "bind to everything" like '0.0.0.0' or '::',
whataremyips() behaves as it always has, returning all IP addresses for
the server.
Also updated probe tests to handle each "server" in the SAIO having a
unique IP address.
For some (noisy) benchmarks that show servers_per_port=X is at least as
good as the same number of "normal" workers:
https://gist.github.com/dbishop/c214f89ca708a6b1624a#file-summary-md
Benchmarks showing the benefits of I/O isolation with a small number of
slow disks:
https://gist.github.com/dbishop/fd0ab067babdecfb07ca#file-results-md
If you were wondering what the overhead of threads_per_disk looks like:
https://gist.github.com/dbishop/1d14755fedc86a161718#file-tabular_results-md
DocImpact
Change-Id: I2239a4000b41a7e7cc53465ce794af49d44796c6
2015-05-14 22:14:15 -07:00
|
|
|
{'id': 3, 'device': 'sdd', 'zone': 4,
|
|
|
|
'region': 2, 'ip': '1.1.1.2', 'port': 1112,
|
2016-02-01 18:06:54 +00:00
|
|
|
'replication_ip': '127.0.0.1', 'replication_port': 6201},
|
Allow 1+ object-servers-per-disk deployment
Enabled by a new > 0 integer config value, "servers_per_port" in the
[DEFAULT] config section for object-server and/or replication server
configs. The setting's integer value determines how many different
object-server workers handle requests for any single unique local port
in the ring. In this mode, the parent swift-object-server process
continues to run as the original user (i.e. root if low-port binding
is required), binds to all ports as defined in the ring, and forks off
the specified number of workers per listen socket. The child, per-port
servers drop privileges and behave pretty much how object-server workers
always have, except that because the ring has unique ports per disk, the
object-servers will only be handling requests for a single disk. The
parent process detects dead servers and restarts them (with the correct
listen socket), starts missing servers when an updated ring file is
found with a device on the server with a new port, and kills extraneous
servers when their port is found to no longer be in the ring. The ring
files are stat'ed at most every "ring_check_interval" seconds, as
configured in the object-server config (same default of 15s).
Immediately stopping all swift-object-worker processes still works by
sending the parent a SIGTERM. Likewise, a SIGHUP to the parent process
still causes the parent process to close all listen sockets and exit,
allowing existing children to finish serving their existing requests.
The drop_privileges helper function now has an optional param to
suppress the setsid() call, which otherwise screws up the child workers'
process management.
The class method RingData.load() can be told to only load the ring
metadata (i.e. everything except replica2part2dev_id) with the optional
kwarg, header_only=True. This is used to keep the parent and all
forked off workers from unnecessarily having full copies of all storage
policy rings in memory.
A new helper class, swift.common.storage_policy.BindPortsCache,
provides a method to return a set of all device ports in all rings for
the server on which it is instantiated (identified by its set of IP
addresses). The BindPortsCache instance will track mtimes of ring
files, so they are not opened more frequently than necessary.
This patch includes enhancements to the probe tests and
object-replicator/object-reconstructor config plumbing to allow the
probe tests to work correctly both in the "normal" config (same IP but
unique ports for each SAIO "server") and a server-per-port setup where
each SAIO "server" must have a unique IP address and unique port per
disk within each "server". The main probe tests only work with 4
servers and 4 disks, but you can see the difference in the rings for the
EC probe tests where there are 2 disks per server for a total of 8
disks. Specifically, swift.common.ring.utils.is_local_device() will
ignore the ports when the "my_port" argument is None. Then,
object-replicator and object-reconstructor both set self.bind_port to
None if server_per_port is enabled. Bonus improvement for IPv6
addresses in is_local_device().
This PR for vagrant-swift-all-in-one will aid in testing this patch:
https://github.com/swiftstack/vagrant-swift-all-in-one/pull/16/
Also allow SAIO to answer is_local_device() better; common SAIO setups
have multiple "servers" all on the same host with different ports for
the different "servers" (which happen to match the IPs specified in the
rings for the devices on each of those "servers").
However, you can configure the SAIO to have different localhost IP
addresses (e.g. 127.0.0.1, 127.0.0.2, etc.) in the ring and in the
servers' config files' bind_ip setting.
This new whataremyips() implementation combined with a little plumbing
allows is_local_device() to accurately answer, even on an SAIO.
In the default case (an unspecified bind_ip defaults to '0.0.0.0') as
well as an explict "bind to everything" like '0.0.0.0' or '::',
whataremyips() behaves as it always has, returning all IP addresses for
the server.
Also updated probe tests to handle each "server" in the SAIO having a
unique IP address.
For some (noisy) benchmarks that show servers_per_port=X is at least as
good as the same number of "normal" workers:
https://gist.github.com/dbishop/c214f89ca708a6b1624a#file-summary-md
Benchmarks showing the benefits of I/O isolation with a small number of
slow disks:
https://gist.github.com/dbishop/fd0ab067babdecfb07ca#file-results-md
If you were wondering what the overhead of threads_per_disk looks like:
https://gist.github.com/dbishop/1d14755fedc86a161718#file-tabular_results-md
DocImpact
Change-Id: I2239a4000b41a7e7cc53465ce794af49d44796c6
2015-05-14 22:14:15 -07:00
|
|
|
]
|
|
|
|
objects_sdb, objects_1_sdb, _, _ = self._write_disk_data('sdb')
|
|
|
|
objects_sdc, objects_1_sdc, _, _ = self._write_disk_data('sdc')
|
|
|
|
objects_sdd, objects_1_sdd, _, _ = self._write_disk_data('sdd')
|
|
|
|
_create_test_rings(self.testdir, devs)
|
|
|
|
|
|
|
|
self.conf['bind_ip'] = '127.0.0.1'
|
|
|
|
self._create_replicator()
|
|
|
|
|
|
|
|
jobs = self.replicator.collect_jobs()
|
|
|
|
|
|
|
|
self.assertEqual([mock.call(jobs)], mock_shuffle.mock_calls)
|
|
|
|
|
|
|
|
jobs_to_delete = [j for j in jobs if j['delete']]
|
2015-08-06 00:55:36 +05:30
|
|
|
self.assertEqual(len(jobs_to_delete), 2)
|
Allow 1+ object-servers-per-disk deployment
Enabled by a new > 0 integer config value, "servers_per_port" in the
[DEFAULT] config section for object-server and/or replication server
configs. The setting's integer value determines how many different
object-server workers handle requests for any single unique local port
in the ring. In this mode, the parent swift-object-server process
continues to run as the original user (i.e. root if low-port binding
is required), binds to all ports as defined in the ring, and forks off
the specified number of workers per listen socket. The child, per-port
servers drop privileges and behave pretty much how object-server workers
always have, except that because the ring has unique ports per disk, the
object-servers will only be handling requests for a single disk. The
parent process detects dead servers and restarts them (with the correct
listen socket), starts missing servers when an updated ring file is
found with a device on the server with a new port, and kills extraneous
servers when their port is found to no longer be in the ring. The ring
files are stat'ed at most every "ring_check_interval" seconds, as
configured in the object-server config (same default of 15s).
Immediately stopping all swift-object-worker processes still works by
sending the parent a SIGTERM. Likewise, a SIGHUP to the parent process
still causes the parent process to close all listen sockets and exit,
allowing existing children to finish serving their existing requests.
The drop_privileges helper function now has an optional param to
suppress the setsid() call, which otherwise screws up the child workers'
process management.
The class method RingData.load() can be told to only load the ring
metadata (i.e. everything except replica2part2dev_id) with the optional
kwarg, header_only=True. This is used to keep the parent and all
forked off workers from unnecessarily having full copies of all storage
policy rings in memory.
A new helper class, swift.common.storage_policy.BindPortsCache,
provides a method to return a set of all device ports in all rings for
the server on which it is instantiated (identified by its set of IP
addresses). The BindPortsCache instance will track mtimes of ring
files, so they are not opened more frequently than necessary.
This patch includes enhancements to the probe tests and
object-replicator/object-reconstructor config plumbing to allow the
probe tests to work correctly both in the "normal" config (same IP but
unique ports for each SAIO "server") and a server-per-port setup where
each SAIO "server" must have a unique IP address and unique port per
disk within each "server". The main probe tests only work with 4
servers and 4 disks, but you can see the difference in the rings for the
EC probe tests where there are 2 disks per server for a total of 8
disks. Specifically, swift.common.ring.utils.is_local_device() will
ignore the ports when the "my_port" argument is None. Then,
object-replicator and object-reconstructor both set self.bind_port to
None if server_per_port is enabled. Bonus improvement for IPv6
addresses in is_local_device().
This PR for vagrant-swift-all-in-one will aid in testing this patch:
https://github.com/swiftstack/vagrant-swift-all-in-one/pull/16/
Also allow SAIO to answer is_local_device() better; common SAIO setups
have multiple "servers" all on the same host with different ports for
the different "servers" (which happen to match the IPs specified in the
rings for the devices on each of those "servers").
However, you can configure the SAIO to have different localhost IP
addresses (e.g. 127.0.0.1, 127.0.0.2, etc.) in the ring and in the
servers' config files' bind_ip setting.
This new whataremyips() implementation combined with a little plumbing
allows is_local_device() to accurately answer, even on an SAIO.
In the default case (an unspecified bind_ip defaults to '0.0.0.0') as
well as an explict "bind to everything" like '0.0.0.0' or '::',
whataremyips() behaves as it always has, returning all IP addresses for
the server.
Also updated probe tests to handle each "server" in the SAIO having a
unique IP address.
For some (noisy) benchmarks that show servers_per_port=X is at least as
good as the same number of "normal" workers:
https://gist.github.com/dbishop/c214f89ca708a6b1624a#file-summary-md
Benchmarks showing the benefits of I/O isolation with a small number of
slow disks:
https://gist.github.com/dbishop/fd0ab067babdecfb07ca#file-results-md
If you were wondering what the overhead of threads_per_disk looks like:
https://gist.github.com/dbishop/1d14755fedc86a161718#file-tabular_results-md
DocImpact
Change-Id: I2239a4000b41a7e7cc53465ce794af49d44796c6
2015-05-14 22:14:15 -07:00
|
|
|
self.assertEqual([
|
|
|
|
'3', # policy 0; 3 not on sdc
|
|
|
|
'3', # policy 1; 3 not on sdc
|
|
|
|
], [j['partition'] for j in jobs_to_delete])
|
|
|
|
|
|
|
|
jobs_by_pol_part_dev = {}
|
|
|
|
for job in jobs:
|
|
|
|
# There should be no jobs with a device not sdc
|
|
|
|
self.assertEqual(job['device'], 'sdc')
|
|
|
|
jobs_by_pol_part_dev[
|
|
|
|
str(int(job['policy'])) + job['partition'] + job['device']
|
|
|
|
] = job
|
|
|
|
|
2015-08-06 00:55:36 +05:30
|
|
|
self.assertEqual([node['id']
|
|
|
|
for node in jobs_by_pol_part_dev['00sdc']['nodes']],
|
|
|
|
[0, 1])
|
|
|
|
self.assertEqual([node['id']
|
|
|
|
for node in jobs_by_pol_part_dev['01sdc']['nodes']],
|
|
|
|
[1, 3])
|
|
|
|
self.assertEqual([node['id']
|
|
|
|
for node in jobs_by_pol_part_dev['02sdc']['nodes']],
|
|
|
|
[3, 0])
|
|
|
|
self.assertEqual([node['id']
|
|
|
|
for node in jobs_by_pol_part_dev['03sdc']['nodes']],
|
|
|
|
[3, 0, 1])
|
|
|
|
self.assertEqual([node['id']
|
|
|
|
for node in jobs_by_pol_part_dev['10sdc']['nodes']],
|
|
|
|
[0, 1])
|
|
|
|
self.assertEqual([node['id']
|
|
|
|
for node in jobs_by_pol_part_dev['11sdc']['nodes']],
|
|
|
|
[1, 3])
|
|
|
|
self.assertEqual([node['id']
|
|
|
|
for node in jobs_by_pol_part_dev['12sdc']['nodes']],
|
|
|
|
[3, 0])
|
|
|
|
self.assertEqual([node['id']
|
|
|
|
for node in jobs_by_pol_part_dev['13sdc']['nodes']],
|
|
|
|
[3, 0, 1])
|
Allow 1+ object-servers-per-disk deployment
Enabled by a new > 0 integer config value, "servers_per_port" in the
[DEFAULT] config section for object-server and/or replication server
configs. The setting's integer value determines how many different
object-server workers handle requests for any single unique local port
in the ring. In this mode, the parent swift-object-server process
continues to run as the original user (i.e. root if low-port binding
is required), binds to all ports as defined in the ring, and forks off
the specified number of workers per listen socket. The child, per-port
servers drop privileges and behave pretty much how object-server workers
always have, except that because the ring has unique ports per disk, the
object-servers will only be handling requests for a single disk. The
parent process detects dead servers and restarts them (with the correct
listen socket), starts missing servers when an updated ring file is
found with a device on the server with a new port, and kills extraneous
servers when their port is found to no longer be in the ring. The ring
files are stat'ed at most every "ring_check_interval" seconds, as
configured in the object-server config (same default of 15s).
Immediately stopping all swift-object-worker processes still works by
sending the parent a SIGTERM. Likewise, a SIGHUP to the parent process
still causes the parent process to close all listen sockets and exit,
allowing existing children to finish serving their existing requests.
The drop_privileges helper function now has an optional param to
suppress the setsid() call, which otherwise screws up the child workers'
process management.
The class method RingData.load() can be told to only load the ring
metadata (i.e. everything except replica2part2dev_id) with the optional
kwarg, header_only=True. This is used to keep the parent and all
forked off workers from unnecessarily having full copies of all storage
policy rings in memory.
A new helper class, swift.common.storage_policy.BindPortsCache,
provides a method to return a set of all device ports in all rings for
the server on which it is instantiated (identified by its set of IP
addresses). The BindPortsCache instance will track mtimes of ring
files, so they are not opened more frequently than necessary.
This patch includes enhancements to the probe tests and
object-replicator/object-reconstructor config plumbing to allow the
probe tests to work correctly both in the "normal" config (same IP but
unique ports for each SAIO "server") and a server-per-port setup where
each SAIO "server" must have a unique IP address and unique port per
disk within each "server". The main probe tests only work with 4
servers and 4 disks, but you can see the difference in the rings for the
EC probe tests where there are 2 disks per server for a total of 8
disks. Specifically, swift.common.ring.utils.is_local_device() will
ignore the ports when the "my_port" argument is None. Then,
object-replicator and object-reconstructor both set self.bind_port to
None if server_per_port is enabled. Bonus improvement for IPv6
addresses in is_local_device().
This PR for vagrant-swift-all-in-one will aid in testing this patch:
https://github.com/swiftstack/vagrant-swift-all-in-one/pull/16/
Also allow SAIO to answer is_local_device() better; common SAIO setups
have multiple "servers" all on the same host with different ports for
the different "servers" (which happen to match the IPs specified in the
rings for the devices on each of those "servers").
However, you can configure the SAIO to have different localhost IP
addresses (e.g. 127.0.0.1, 127.0.0.2, etc.) in the ring and in the
servers' config files' bind_ip setting.
This new whataremyips() implementation combined with a little plumbing
allows is_local_device() to accurately answer, even on an SAIO.
In the default case (an unspecified bind_ip defaults to '0.0.0.0') as
well as an explict "bind to everything" like '0.0.0.0' or '::',
whataremyips() behaves as it always has, returning all IP addresses for
the server.
Also updated probe tests to handle each "server" in the SAIO having a
unique IP address.
For some (noisy) benchmarks that show servers_per_port=X is at least as
good as the same number of "normal" workers:
https://gist.github.com/dbishop/c214f89ca708a6b1624a#file-summary-md
Benchmarks showing the benefits of I/O isolation with a small number of
slow disks:
https://gist.github.com/dbishop/fd0ab067babdecfb07ca#file-results-md
If you were wondering what the overhead of threads_per_disk looks like:
https://gist.github.com/dbishop/1d14755fedc86a161718#file-tabular_results-md
DocImpact
Change-Id: I2239a4000b41a7e7cc53465ce794af49d44796c6
2015-05-14 22:14:15 -07:00
|
|
|
for part in ['00', '01', '02', '03']:
|
2015-08-06 00:55:36 +05:30
|
|
|
self.assertEqual(jobs_by_pol_part_dev[part + 'sdc']['path'],
|
|
|
|
os.path.join(objects_sdc, part[1:]))
|
Allow 1+ object-servers-per-disk deployment
Enabled by a new > 0 integer config value, "servers_per_port" in the
[DEFAULT] config section for object-server and/or replication server
configs. The setting's integer value determines how many different
object-server workers handle requests for any single unique local port
in the ring. In this mode, the parent swift-object-server process
continues to run as the original user (i.e. root if low-port binding
is required), binds to all ports as defined in the ring, and forks off
the specified number of workers per listen socket. The child, per-port
servers drop privileges and behave pretty much how object-server workers
always have, except that because the ring has unique ports per disk, the
object-servers will only be handling requests for a single disk. The
parent process detects dead servers and restarts them (with the correct
listen socket), starts missing servers when an updated ring file is
found with a device on the server with a new port, and kills extraneous
servers when their port is found to no longer be in the ring. The ring
files are stat'ed at most every "ring_check_interval" seconds, as
configured in the object-server config (same default of 15s).
Immediately stopping all swift-object-worker processes still works by
sending the parent a SIGTERM. Likewise, a SIGHUP to the parent process
still causes the parent process to close all listen sockets and exit,
allowing existing children to finish serving their existing requests.
The drop_privileges helper function now has an optional param to
suppress the setsid() call, which otherwise screws up the child workers'
process management.
The class method RingData.load() can be told to only load the ring
metadata (i.e. everything except replica2part2dev_id) with the optional
kwarg, header_only=True. This is used to keep the parent and all
forked off workers from unnecessarily having full copies of all storage
policy rings in memory.
A new helper class, swift.common.storage_policy.BindPortsCache,
provides a method to return a set of all device ports in all rings for
the server on which it is instantiated (identified by its set of IP
addresses). The BindPortsCache instance will track mtimes of ring
files, so they are not opened more frequently than necessary.
This patch includes enhancements to the probe tests and
object-replicator/object-reconstructor config plumbing to allow the
probe tests to work correctly both in the "normal" config (same IP but
unique ports for each SAIO "server") and a server-per-port setup where
each SAIO "server" must have a unique IP address and unique port per
disk within each "server". The main probe tests only work with 4
servers and 4 disks, but you can see the difference in the rings for the
EC probe tests where there are 2 disks per server for a total of 8
disks. Specifically, swift.common.ring.utils.is_local_device() will
ignore the ports when the "my_port" argument is None. Then,
object-replicator and object-reconstructor both set self.bind_port to
None if server_per_port is enabled. Bonus improvement for IPv6
addresses in is_local_device().
This PR for vagrant-swift-all-in-one will aid in testing this patch:
https://github.com/swiftstack/vagrant-swift-all-in-one/pull/16/
Also allow SAIO to answer is_local_device() better; common SAIO setups
have multiple "servers" all on the same host with different ports for
the different "servers" (which happen to match the IPs specified in the
rings for the devices on each of those "servers").
However, you can configure the SAIO to have different localhost IP
addresses (e.g. 127.0.0.1, 127.0.0.2, etc.) in the ring and in the
servers' config files' bind_ip setting.
This new whataremyips() implementation combined with a little plumbing
allows is_local_device() to accurately answer, even on an SAIO.
In the default case (an unspecified bind_ip defaults to '0.0.0.0') as
well as an explict "bind to everything" like '0.0.0.0' or '::',
whataremyips() behaves as it always has, returning all IP addresses for
the server.
Also updated probe tests to handle each "server" in the SAIO having a
unique IP address.
For some (noisy) benchmarks that show servers_per_port=X is at least as
good as the same number of "normal" workers:
https://gist.github.com/dbishop/c214f89ca708a6b1624a#file-summary-md
Benchmarks showing the benefits of I/O isolation with a small number of
slow disks:
https://gist.github.com/dbishop/fd0ab067babdecfb07ca#file-results-md
If you were wondering what the overhead of threads_per_disk looks like:
https://gist.github.com/dbishop/1d14755fedc86a161718#file-tabular_results-md
DocImpact
Change-Id: I2239a4000b41a7e7cc53465ce794af49d44796c6
2015-05-14 22:14:15 -07:00
|
|
|
for part in ['10', '11', '12', '13']:
|
2015-08-06 00:55:36 +05:30
|
|
|
self.assertEqual(jobs_by_pol_part_dev[part + 'sdc']['path'],
|
|
|
|
os.path.join(objects_1_sdc, part[1:]))
|
Allow 1+ object-servers-per-disk deployment
Enabled by a new > 0 integer config value, "servers_per_port" in the
[DEFAULT] config section for object-server and/or replication server
configs. The setting's integer value determines how many different
object-server workers handle requests for any single unique local port
in the ring. In this mode, the parent swift-object-server process
continues to run as the original user (i.e. root if low-port binding
is required), binds to all ports as defined in the ring, and forks off
the specified number of workers per listen socket. The child, per-port
servers drop privileges and behave pretty much how object-server workers
always have, except that because the ring has unique ports per disk, the
object-servers will only be handling requests for a single disk. The
parent process detects dead servers and restarts them (with the correct
listen socket), starts missing servers when an updated ring file is
found with a device on the server with a new port, and kills extraneous
servers when their port is found to no longer be in the ring. The ring
files are stat'ed at most every "ring_check_interval" seconds, as
configured in the object-server config (same default of 15s).
Immediately stopping all swift-object-worker processes still works by
sending the parent a SIGTERM. Likewise, a SIGHUP to the parent process
still causes the parent process to close all listen sockets and exit,
allowing existing children to finish serving their existing requests.
The drop_privileges helper function now has an optional param to
suppress the setsid() call, which otherwise screws up the child workers'
process management.
The class method RingData.load() can be told to only load the ring
metadata (i.e. everything except replica2part2dev_id) with the optional
kwarg, header_only=True. This is used to keep the parent and all
forked off workers from unnecessarily having full copies of all storage
policy rings in memory.
A new helper class, swift.common.storage_policy.BindPortsCache,
provides a method to return a set of all device ports in all rings for
the server on which it is instantiated (identified by its set of IP
addresses). The BindPortsCache instance will track mtimes of ring
files, so they are not opened more frequently than necessary.
This patch includes enhancements to the probe tests and
object-replicator/object-reconstructor config plumbing to allow the
probe tests to work correctly both in the "normal" config (same IP but
unique ports for each SAIO "server") and a server-per-port setup where
each SAIO "server" must have a unique IP address and unique port per
disk within each "server". The main probe tests only work with 4
servers and 4 disks, but you can see the difference in the rings for the
EC probe tests where there are 2 disks per server for a total of 8
disks. Specifically, swift.common.ring.utils.is_local_device() will
ignore the ports when the "my_port" argument is None. Then,
object-replicator and object-reconstructor both set self.bind_port to
None if server_per_port is enabled. Bonus improvement for IPv6
addresses in is_local_device().
This PR for vagrant-swift-all-in-one will aid in testing this patch:
https://github.com/swiftstack/vagrant-swift-all-in-one/pull/16/
Also allow SAIO to answer is_local_device() better; common SAIO setups
have multiple "servers" all on the same host with different ports for
the different "servers" (which happen to match the IPs specified in the
rings for the devices on each of those "servers").
However, you can configure the SAIO to have different localhost IP
addresses (e.g. 127.0.0.1, 127.0.0.2, etc.) in the ring and in the
servers' config files' bind_ip setting.
This new whataremyips() implementation combined with a little plumbing
allows is_local_device() to accurately answer, even on an SAIO.
In the default case (an unspecified bind_ip defaults to '0.0.0.0') as
well as an explict "bind to everything" like '0.0.0.0' or '::',
whataremyips() behaves as it always has, returning all IP addresses for
the server.
Also updated probe tests to handle each "server" in the SAIO having a
unique IP address.
For some (noisy) benchmarks that show servers_per_port=X is at least as
good as the same number of "normal" workers:
https://gist.github.com/dbishop/c214f89ca708a6b1624a#file-summary-md
Benchmarks showing the benefits of I/O isolation with a small number of
slow disks:
https://gist.github.com/dbishop/fd0ab067babdecfb07ca#file-results-md
If you were wondering what the overhead of threads_per_disk looks like:
https://gist.github.com/dbishop/1d14755fedc86a161718#file-tabular_results-md
DocImpact
Change-Id: I2239a4000b41a7e7cc53465ce794af49d44796c6
2015-05-14 22:14:15 -07:00
|
|
|
|
|
|
|
@mock.patch('swift.obj.replicator.random.shuffle', side_effect=lambda l: l)
|
|
|
|
def test_collect_jobs_multi_disk_servers_per_port(self, mock_shuffle):
|
|
|
|
# Normally (servers_per_port=0), replication_ip AND replication_port
|
|
|
|
# are used to determine local ring device entries. Here we show that
|
2016-02-01 18:06:54 +00:00
|
|
|
# with servers_per_port > 0 and bind_ip='127.0.0.1', bind_port=6200,
|
Allow 1+ object-servers-per-disk deployment
Enabled by a new > 0 integer config value, "servers_per_port" in the
[DEFAULT] config section for object-server and/or replication server
configs. The setting's integer value determines how many different
object-server workers handle requests for any single unique local port
in the ring. In this mode, the parent swift-object-server process
continues to run as the original user (i.e. root if low-port binding
is required), binds to all ports as defined in the ring, and forks off
the specified number of workers per listen socket. The child, per-port
servers drop privileges and behave pretty much how object-server workers
always have, except that because the ring has unique ports per disk, the
object-servers will only be handling requests for a single disk. The
parent process detects dead servers and restarts them (with the correct
listen socket), starts missing servers when an updated ring file is
found with a device on the server with a new port, and kills extraneous
servers when their port is found to no longer be in the ring. The ring
files are stat'ed at most every "ring_check_interval" seconds, as
configured in the object-server config (same default of 15s).
Immediately stopping all swift-object-worker processes still works by
sending the parent a SIGTERM. Likewise, a SIGHUP to the parent process
still causes the parent process to close all listen sockets and exit,
allowing existing children to finish serving their existing requests.
The drop_privileges helper function now has an optional param to
suppress the setsid() call, which otherwise screws up the child workers'
process management.
The class method RingData.load() can be told to only load the ring
metadata (i.e. everything except replica2part2dev_id) with the optional
kwarg, header_only=True. This is used to keep the parent and all
forked off workers from unnecessarily having full copies of all storage
policy rings in memory.
A new helper class, swift.common.storage_policy.BindPortsCache,
provides a method to return a set of all device ports in all rings for
the server on which it is instantiated (identified by its set of IP
addresses). The BindPortsCache instance will track mtimes of ring
files, so they are not opened more frequently than necessary.
This patch includes enhancements to the probe tests and
object-replicator/object-reconstructor config plumbing to allow the
probe tests to work correctly both in the "normal" config (same IP but
unique ports for each SAIO "server") and a server-per-port setup where
each SAIO "server" must have a unique IP address and unique port per
disk within each "server". The main probe tests only work with 4
servers and 4 disks, but you can see the difference in the rings for the
EC probe tests where there are 2 disks per server for a total of 8
disks. Specifically, swift.common.ring.utils.is_local_device() will
ignore the ports when the "my_port" argument is None. Then,
object-replicator and object-reconstructor both set self.bind_port to
None if servers_per_port is enabled. Bonus improvement for IPv6
addresses in is_local_device().
This PR for vagrant-swift-all-in-one will aid in testing this patch:
https://github.com/swiftstack/vagrant-swift-all-in-one/pull/16/
Also allow SAIO to answer is_local_device() better; common SAIO setups
have multiple "servers" all on the same host with different ports for
the different "servers" (which happen to match the IPs specified in the
rings for the devices on each of those "servers").
However, you can configure the SAIO to have different localhost IP
addresses (e.g. 127.0.0.1, 127.0.0.2, etc.) in the ring and in the
servers' config files' bind_ip setting.
This new whataremyips() implementation combined with a little plumbing
allows is_local_device() to accurately answer, even on an SAIO.
In the default case (an unspecified bind_ip defaults to '0.0.0.0') as
well as an explicit "bind to everything" like '0.0.0.0' or '::',
whataremyips() behaves as it always has, returning all IP addresses for
the server.
Also updated probe tests to handle each "server" in the SAIO having a
unique IP address.
For some (noisy) benchmarks that show servers_per_port=X is at least as
good as the same number of "normal" workers:
https://gist.github.com/dbishop/c214f89ca708a6b1624a#file-summary-md
Benchmarks showing the benefits of I/O isolation with a small number of
slow disks:
https://gist.github.com/dbishop/fd0ab067babdecfb07ca#file-results-md
If you were wondering what the overhead of threads_per_disk looks like:
https://gist.github.com/dbishop/1d14755fedc86a161718#file-tabular_results-md
DocImpact
Change-Id: I2239a4000b41a7e7cc53465ce794af49d44796c6
2015-05-14 22:14:15 -07:00
|
|
|
# then both "sdc" and "sdd" are local.
|
|
|
|
devs = [
|
|
|
|
# Two disks on same IP/port
|
|
|
|
{'id': 0, 'device': 'sda', 'zone': 0,
|
|
|
|
'region': 1, 'ip': '1.1.1.1', 'port': 1111,
|
2016-02-01 18:06:54 +00:00
|
|
|
'replication_ip': '127.0.0.0', 'replication_port': 6200},
|
Allow 1+ object-servers-per-disk deployment
Enabled by a new > 0 integer config value, "servers_per_port" in the
[DEFAULT] config section for object-server and/or replication server
configs. The setting's integer value determines how many different
object-server workers handle requests for any single unique local port
in the ring. In this mode, the parent swift-object-server process
continues to run as the original user (i.e. root if low-port binding
is required), binds to all ports as defined in the ring, and forks off
the specified number of workers per listen socket. The child, per-port
servers drop privileges and behave pretty much how object-server workers
always have, except that because the ring has unique ports per disk, the
object-servers will only be handling requests for a single disk. The
parent process detects dead servers and restarts them (with the correct
listen socket), starts missing servers when an updated ring file is
found with a device on the server with a new port, and kills extraneous
servers when their port is found to no longer be in the ring. The ring
files are stat'ed at most every "ring_check_interval" seconds, as
configured in the object-server config (same default of 15s).
Immediately stopping all swift-object-worker processes still works by
sending the parent a SIGTERM. Likewise, a SIGHUP to the parent process
still causes the parent process to close all listen sockets and exit,
allowing existing children to finish serving their existing requests.
The drop_privileges helper function now has an optional param to
suppress the setsid() call, which otherwise screws up the child workers'
process management.
The class method RingData.load() can be told to only load the ring
metadata (i.e. everything except replica2part2dev_id) with the optional
kwarg, header_only=True. This is used to keep the parent and all
forked off workers from unnecessarily having full copies of all storage
policy rings in memory.
A new helper class, swift.common.storage_policy.BindPortsCache,
provides a method to return a set of all device ports in all rings for
the server on which it is instantiated (identified by its set of IP
addresses). The BindPortsCache instance will track mtimes of ring
files, so they are not opened more frequently than necessary.
This patch includes enhancements to the probe tests and
object-replicator/object-reconstructor config plumbing to allow the
probe tests to work correctly both in the "normal" config (same IP but
unique ports for each SAIO "server") and a server-per-port setup where
each SAIO "server" must have a unique IP address and unique port per
disk within each "server". The main probe tests only work with 4
servers and 4 disks, but you can see the difference in the rings for the
EC probe tests where there are 2 disks per server for a total of 8
disks. Specifically, swift.common.ring.utils.is_local_device() will
ignore the ports when the "my_port" argument is None. Then,
object-replicator and object-reconstructor both set self.bind_port to
None if servers_per_port is enabled. Bonus improvement for IPv6
addresses in is_local_device().
This PR for vagrant-swift-all-in-one will aid in testing this patch:
https://github.com/swiftstack/vagrant-swift-all-in-one/pull/16/
Also allow SAIO to answer is_local_device() better; common SAIO setups
have multiple "servers" all on the same host with different ports for
the different "servers" (which happen to match the IPs specified in the
rings for the devices on each of those "servers").
However, you can configure the SAIO to have different localhost IP
addresses (e.g. 127.0.0.1, 127.0.0.2, etc.) in the ring and in the
servers' config files' bind_ip setting.
This new whataremyips() implementation combined with a little plumbing
allows is_local_device() to accurately answer, even on an SAIO.
In the default case (an unspecified bind_ip defaults to '0.0.0.0') as
well as an explicit "bind to everything" like '0.0.0.0' or '::',
whataremyips() behaves as it always has, returning all IP addresses for
the server.
Also updated probe tests to handle each "server" in the SAIO having a
unique IP address.
For some (noisy) benchmarks that show servers_per_port=X is at least as
good as the same number of "normal" workers:
https://gist.github.com/dbishop/c214f89ca708a6b1624a#file-summary-md
Benchmarks showing the benefits of I/O isolation with a small number of
slow disks:
https://gist.github.com/dbishop/fd0ab067babdecfb07ca#file-results-md
If you were wondering what the overhead of threads_per_disk looks like:
https://gist.github.com/dbishop/1d14755fedc86a161718#file-tabular_results-md
DocImpact
Change-Id: I2239a4000b41a7e7cc53465ce794af49d44796c6
2015-05-14 22:14:15 -07:00
|
|
|
{'id': 1, 'device': 'sdb', 'zone': 1,
|
|
|
|
'region': 1, 'ip': '1.1.1.1', 'port': 1111,
|
2016-02-01 18:06:54 +00:00
|
|
|
'replication_ip': '127.0.0.0', 'replication_port': 6200},
|
Allow 1+ object-servers-per-disk deployment
Enabled by a new > 0 integer config value, "servers_per_port" in the
[DEFAULT] config section for object-server and/or replication server
configs. The setting's integer value determines how many different
object-server workers handle requests for any single unique local port
in the ring. In this mode, the parent swift-object-server process
continues to run as the original user (i.e. root if low-port binding
is required), binds to all ports as defined in the ring, and forks off
the specified number of workers per listen socket. The child, per-port
servers drop privileges and behave pretty much how object-server workers
always have, except that because the ring has unique ports per disk, the
object-servers will only be handling requests for a single disk. The
parent process detects dead servers and restarts them (with the correct
listen socket), starts missing servers when an updated ring file is
found with a device on the server with a new port, and kills extraneous
servers when their port is found to no longer be in the ring. The ring
files are stat'ed at most every "ring_check_interval" seconds, as
configured in the object-server config (same default of 15s).
Immediately stopping all swift-object-worker processes still works by
sending the parent a SIGTERM. Likewise, a SIGHUP to the parent process
still causes the parent process to close all listen sockets and exit,
allowing existing children to finish serving their existing requests.
The drop_privileges helper function now has an optional param to
suppress the setsid() call, which otherwise screws up the child workers'
process management.
The class method RingData.load() can be told to only load the ring
metadata (i.e. everything except replica2part2dev_id) with the optional
kwarg, header_only=True. This is used to keep the parent and all
forked off workers from unnecessarily having full copies of all storage
policy rings in memory.
A new helper class, swift.common.storage_policy.BindPortsCache,
provides a method to return a set of all device ports in all rings for
the server on which it is instantiated (identified by its set of IP
addresses). The BindPortsCache instance will track mtimes of ring
files, so they are not opened more frequently than necessary.
This patch includes enhancements to the probe tests and
object-replicator/object-reconstructor config plumbing to allow the
probe tests to work correctly both in the "normal" config (same IP but
unique ports for each SAIO "server") and a server-per-port setup where
each SAIO "server" must have a unique IP address and unique port per
disk within each "server". The main probe tests only work with 4
servers and 4 disks, but you can see the difference in the rings for the
EC probe tests where there are 2 disks per server for a total of 8
disks. Specifically, swift.common.ring.utils.is_local_device() will
ignore the ports when the "my_port" argument is None. Then,
object-replicator and object-reconstructor both set self.bind_port to
None if servers_per_port is enabled. Bonus improvement for IPv6
addresses in is_local_device().
This PR for vagrant-swift-all-in-one will aid in testing this patch:
https://github.com/swiftstack/vagrant-swift-all-in-one/pull/16/
Also allow SAIO to answer is_local_device() better; common SAIO setups
have multiple "servers" all on the same host with different ports for
the different "servers" (which happen to match the IPs specified in the
rings for the devices on each of those "servers").
However, you can configure the SAIO to have different localhost IP
addresses (e.g. 127.0.0.1, 127.0.0.2, etc.) in the ring and in the
servers' config files' bind_ip setting.
This new whataremyips() implementation combined with a little plumbing
allows is_local_device() to accurately answer, even on an SAIO.
In the default case (an unspecified bind_ip defaults to '0.0.0.0') as
well as an explicit "bind to everything" like '0.0.0.0' or '::',
whataremyips() behaves as it always has, returning all IP addresses for
the server.
Also updated probe tests to handle each "server" in the SAIO having a
unique IP address.
For some (noisy) benchmarks that show servers_per_port=X is at least as
good as the same number of "normal" workers:
https://gist.github.com/dbishop/c214f89ca708a6b1624a#file-summary-md
Benchmarks showing the benefits of I/O isolation with a small number of
slow disks:
https://gist.github.com/dbishop/fd0ab067babdecfb07ca#file-results-md
If you were wondering what the overhead of threads_per_disk looks like:
https://gist.github.com/dbishop/1d14755fedc86a161718#file-tabular_results-md
DocImpact
Change-Id: I2239a4000b41a7e7cc53465ce794af49d44796c6
2015-05-14 22:14:15 -07:00
|
|
|
# Two disks on same server, different ports
|
|
|
|
{'id': 2, 'device': 'sdc', 'zone': 2,
|
|
|
|
'region': 2, 'ip': '1.1.1.2', 'port': 1112,
|
2016-02-01 18:06:54 +00:00
|
|
|
'replication_ip': '127.0.0.1', 'replication_port': 6200},
|
Allow 1+ object-servers-per-disk deployment
Enabled by a new > 0 integer config value, "servers_per_port" in the
[DEFAULT] config section for object-server and/or replication server
configs. The setting's integer value determines how many different
object-server workers handle requests for any single unique local port
in the ring. In this mode, the parent swift-object-server process
continues to run as the original user (i.e. root if low-port binding
is required), binds to all ports as defined in the ring, and forks off
the specified number of workers per listen socket. The child, per-port
servers drop privileges and behave pretty much how object-server workers
always have, except that because the ring has unique ports per disk, the
object-servers will only be handling requests for a single disk. The
parent process detects dead servers and restarts them (with the correct
listen socket), starts missing servers when an updated ring file is
found with a device on the server with a new port, and kills extraneous
servers when their port is found to no longer be in the ring. The ring
files are stat'ed at most every "ring_check_interval" seconds, as
configured in the object-server config (same default of 15s).
Immediately stopping all swift-object-worker processes still works by
sending the parent a SIGTERM. Likewise, a SIGHUP to the parent process
still causes the parent process to close all listen sockets and exit,
allowing existing children to finish serving their existing requests.
The drop_privileges helper function now has an optional param to
suppress the setsid() call, which otherwise screws up the child workers'
process management.
The class method RingData.load() can be told to only load the ring
metadata (i.e. everything except replica2part2dev_id) with the optional
kwarg, header_only=True. This is used to keep the parent and all
forked off workers from unnecessarily having full copies of all storage
policy rings in memory.
A new helper class, swift.common.storage_policy.BindPortsCache,
provides a method to return a set of all device ports in all rings for
the server on which it is instantiated (identified by its set of IP
addresses). The BindPortsCache instance will track mtimes of ring
files, so they are not opened more frequently than necessary.
This patch includes enhancements to the probe tests and
object-replicator/object-reconstructor config plumbing to allow the
probe tests to work correctly both in the "normal" config (same IP but
unique ports for each SAIO "server") and a server-per-port setup where
each SAIO "server" must have a unique IP address and unique port per
disk within each "server". The main probe tests only work with 4
servers and 4 disks, but you can see the difference in the rings for the
EC probe tests where there are 2 disks per server for a total of 8
disks. Specifically, swift.common.ring.utils.is_local_device() will
ignore the ports when the "my_port" argument is None. Then,
object-replicator and object-reconstructor both set self.bind_port to
None if servers_per_port is enabled. Bonus improvement for IPv6
addresses in is_local_device().
This PR for vagrant-swift-all-in-one will aid in testing this patch:
https://github.com/swiftstack/vagrant-swift-all-in-one/pull/16/
Also allow SAIO to answer is_local_device() better; common SAIO setups
have multiple "servers" all on the same host with different ports for
the different "servers" (which happen to match the IPs specified in the
rings for the devices on each of those "servers").
However, you can configure the SAIO to have different localhost IP
addresses (e.g. 127.0.0.1, 127.0.0.2, etc.) in the ring and in the
servers' config files' bind_ip setting.
This new whataremyips() implementation combined with a little plumbing
allows is_local_device() to accurately answer, even on an SAIO.
In the default case (an unspecified bind_ip defaults to '0.0.0.0') as
well as an explicit "bind to everything" like '0.0.0.0' or '::',
whataremyips() behaves as it always has, returning all IP addresses for
the server.
Also updated probe tests to handle each "server" in the SAIO having a
unique IP address.
For some (noisy) benchmarks that show servers_per_port=X is at least as
good as the same number of "normal" workers:
https://gist.github.com/dbishop/c214f89ca708a6b1624a#file-summary-md
Benchmarks showing the benefits of I/O isolation with a small number of
slow disks:
https://gist.github.com/dbishop/fd0ab067babdecfb07ca#file-results-md
If you were wondering what the overhead of threads_per_disk looks like:
https://gist.github.com/dbishop/1d14755fedc86a161718#file-tabular_results-md
DocImpact
Change-Id: I2239a4000b41a7e7cc53465ce794af49d44796c6
2015-05-14 22:14:15 -07:00
|
|
|
{'id': 3, 'device': 'sdd', 'zone': 4,
|
|
|
|
'region': 2, 'ip': '1.1.1.2', 'port': 1112,
|
2016-02-01 18:06:54 +00:00
|
|
|
'replication_ip': '127.0.0.1', 'replication_port': 6201},
|
Allow 1+ object-servers-per-disk deployment
Enabled by a new > 0 integer config value, "servers_per_port" in the
[DEFAULT] config section for object-server and/or replication server
configs. The setting's integer value determines how many different
object-server workers handle requests for any single unique local port
in the ring. In this mode, the parent swift-object-server process
continues to run as the original user (i.e. root if low-port binding
is required), binds to all ports as defined in the ring, and forks off
the specified number of workers per listen socket. The child, per-port
servers drop privileges and behave pretty much how object-server workers
always have, except that because the ring has unique ports per disk, the
object-servers will only be handling requests for a single disk. The
parent process detects dead servers and restarts them (with the correct
listen socket), starts missing servers when an updated ring file is
found with a device on the server with a new port, and kills extraneous
servers when their port is found to no longer be in the ring. The ring
files are stat'ed at most every "ring_check_interval" seconds, as
configured in the object-server config (same default of 15s).
Immediately stopping all swift-object-worker processes still works by
sending the parent a SIGTERM. Likewise, a SIGHUP to the parent process
still causes the parent process to close all listen sockets and exit,
allowing existing children to finish serving their existing requests.
The drop_privileges helper function now has an optional param to
suppress the setsid() call, which otherwise screws up the child workers'
process management.
The class method RingData.load() can be told to only load the ring
metadata (i.e. everything except replica2part2dev_id) with the optional
kwarg, header_only=True. This is used to keep the parent and all
forked off workers from unnecessarily having full copies of all storage
policy rings in memory.
A new helper class, swift.common.storage_policy.BindPortsCache,
provides a method to return a set of all device ports in all rings for
the server on which it is instantiated (identified by its set of IP
addresses). The BindPortsCache instance will track mtimes of ring
files, so they are not opened more frequently than necessary.
This patch includes enhancements to the probe tests and
object-replicator/object-reconstructor config plumbing to allow the
probe tests to work correctly both in the "normal" config (same IP but
unique ports for each SAIO "server") and a server-per-port setup where
each SAIO "server" must have a unique IP address and unique port per
disk within each "server". The main probe tests only work with 4
servers and 4 disks, but you can see the difference in the rings for the
EC probe tests where there are 2 disks per server for a total of 8
disks. Specifically, swift.common.ring.utils.is_local_device() will
ignore the ports when the "my_port" argument is None. Then,
object-replicator and object-reconstructor both set self.bind_port to
None if servers_per_port is enabled. Bonus improvement for IPv6
addresses in is_local_device().
This PR for vagrant-swift-all-in-one will aid in testing this patch:
https://github.com/swiftstack/vagrant-swift-all-in-one/pull/16/
Also allow SAIO to answer is_local_device() better; common SAIO setups
have multiple "servers" all on the same host with different ports for
the different "servers" (which happen to match the IPs specified in the
rings for the devices on each of those "servers").
However, you can configure the SAIO to have different localhost IP
addresses (e.g. 127.0.0.1, 127.0.0.2, etc.) in the ring and in the
servers' config files' bind_ip setting.
This new whataremyips() implementation combined with a little plumbing
allows is_local_device() to accurately answer, even on an SAIO.
In the default case (an unspecified bind_ip defaults to '0.0.0.0') as
well as an explicit "bind to everything" like '0.0.0.0' or '::',
whataremyips() behaves as it always has, returning all IP addresses for
the server.
Also updated probe tests to handle each "server" in the SAIO having a
unique IP address.
For some (noisy) benchmarks that show servers_per_port=X is at least as
good as the same number of "normal" workers:
https://gist.github.com/dbishop/c214f89ca708a6b1624a#file-summary-md
Benchmarks showing the benefits of I/O isolation with a small number of
slow disks:
https://gist.github.com/dbishop/fd0ab067babdecfb07ca#file-results-md
If you were wondering what the overhead of threads_per_disk looks like:
https://gist.github.com/dbishop/1d14755fedc86a161718#file-tabular_results-md
DocImpact
Change-Id: I2239a4000b41a7e7cc53465ce794af49d44796c6
2015-05-14 22:14:15 -07:00
|
|
|
]
|
|
|
|
objects_sdb, objects_1_sdb, _, _ = self._write_disk_data('sdb')
|
|
|
|
objects_sdc, objects_1_sdc, _, _ = self._write_disk_data('sdc')
|
|
|
|
objects_sdd, objects_1_sdd, _, _ = self._write_disk_data('sdd')
|
|
|
|
_create_test_rings(self.testdir, devs)
|
|
|
|
|
|
|
|
self.conf['bind_ip'] = '127.0.0.1'
|
|
|
|
self.conf['servers_per_port'] = 1 # diff port ok
|
|
|
|
self._create_replicator()
|
|
|
|
|
|
|
|
jobs = self.replicator.collect_jobs()
|
|
|
|
|
|
|
|
self.assertEqual([mock.call(jobs)], mock_shuffle.mock_calls)
|
|
|
|
|
|
|
|
jobs_to_delete = [j for j in jobs if j['delete']]
|
2015-08-06 00:55:36 +05:30
|
|
|
self.assertEqual(len(jobs_to_delete), 4)
|
Allow 1+ object-servers-per-disk deployment
Enabled by a new > 0 integer config value, "servers_per_port" in the
[DEFAULT] config section for object-server and/or replication server
configs. The setting's integer value determines how many different
object-server workers handle requests for any single unique local port
in the ring. In this mode, the parent swift-object-server process
continues to run as the original user (i.e. root if low-port binding
is required), binds to all ports as defined in the ring, and forks off
the specified number of workers per listen socket. The child, per-port
servers drop privileges and behave pretty much how object-server workers
always have, except that because the ring has unique ports per disk, the
object-servers will only be handling requests for a single disk. The
parent process detects dead servers and restarts them (with the correct
listen socket), starts missing servers when an updated ring file is
found with a device on the server with a new port, and kills extraneous
servers when their port is found to no longer be in the ring. The ring
files are stat'ed at most every "ring_check_interval" seconds, as
configured in the object-server config (same default of 15s).
Immediately stopping all swift-object-worker processes still works by
sending the parent a SIGTERM. Likewise, a SIGHUP to the parent process
still causes the parent process to close all listen sockets and exit,
allowing existing children to finish serving their existing requests.
The drop_privileges helper function now has an optional param to
suppress the setsid() call, which otherwise screws up the child workers'
process management.
The class method RingData.load() can be told to only load the ring
metadata (i.e. everything except replica2part2dev_id) with the optional
kwarg, header_only=True. This is used to keep the parent and all
forked off workers from unnecessarily having full copies of all storage
policy rings in memory.
A new helper class, swift.common.storage_policy.BindPortsCache,
provides a method to return a set of all device ports in all rings for
the server on which it is instantiated (identified by its set of IP
addresses). The BindPortsCache instance will track mtimes of ring
files, so they are not opened more frequently than necessary.
This patch includes enhancements to the probe tests and
object-replicator/object-reconstructor config plumbing to allow the
probe tests to work correctly both in the "normal" config (same IP but
unique ports for each SAIO "server") and a server-per-port setup where
each SAIO "server" must have a unique IP address and unique port per
disk within each "server". The main probe tests only work with 4
servers and 4 disks, but you can see the difference in the rings for the
EC probe tests where there are 2 disks per server for a total of 8
disks. Specifically, swift.common.ring.utils.is_local_device() will
ignore the ports when the "my_port" argument is None. Then,
object-replicator and object-reconstructor both set self.bind_port to
None if servers_per_port is enabled. Bonus improvement for IPv6
addresses in is_local_device().
This PR for vagrant-swift-all-in-one will aid in testing this patch:
https://github.com/swiftstack/vagrant-swift-all-in-one/pull/16/
Also allow SAIO to answer is_local_device() better; common SAIO setups
have multiple "servers" all on the same host with different ports for
the different "servers" (which happen to match the IPs specified in the
rings for the devices on each of those "servers").
However, you can configure the SAIO to have different localhost IP
addresses (e.g. 127.0.0.1, 127.0.0.2, etc.) in the ring and in the
servers' config files' bind_ip setting.
This new whataremyips() implementation combined with a little plumbing
allows is_local_device() to accurately answer, even on an SAIO.
In the default case (an unspecified bind_ip defaults to '0.0.0.0') as
well as an explicit "bind to everything" like '0.0.0.0' or '::',
whataremyips() behaves as it always has, returning all IP addresses for
the server.
Also updated probe tests to handle each "server" in the SAIO having a
unique IP address.
For some (noisy) benchmarks that show servers_per_port=X is at least as
good as the same number of "normal" workers:
https://gist.github.com/dbishop/c214f89ca708a6b1624a#file-summary-md
Benchmarks showing the benefits of I/O isolation with a small number of
slow disks:
https://gist.github.com/dbishop/fd0ab067babdecfb07ca#file-results-md
If you were wondering what the overhead of threads_per_disk looks like:
https://gist.github.com/dbishop/1d14755fedc86a161718#file-tabular_results-md
DocImpact
Change-Id: I2239a4000b41a7e7cc53465ce794af49d44796c6
2015-05-14 22:14:15 -07:00
|
|
|
self.assertEqual([
|
|
|
|
'3', '0', # policy 0; 3 not on sdc, 0 not on sdd
|
|
|
|
'3', '0', # policy 1; 3 not on sdc, 0 not on sdd
|
|
|
|
], [j['partition'] for j in jobs_to_delete])
|
|
|
|
|
|
|
|
jobs_by_pol_part_dev = {}
|
|
|
|
for job in jobs:
|
|
|
|
# There should be no jobs with a device not in just sdc & sdd
|
|
|
|
self.assertTrue(job['device'] in ('sdc', 'sdd'))
|
|
|
|
jobs_by_pol_part_dev[
|
|
|
|
str(int(job['policy'])) + job['partition'] + job['device']
|
|
|
|
] = job
|
|
|
|
|
2015-08-06 00:55:36 +05:30
|
|
|
self.assertEqual([node['id']
|
|
|
|
for node in jobs_by_pol_part_dev['00sdc']['nodes']],
|
|
|
|
[0, 1])
|
|
|
|
self.assertEqual([node['id']
|
|
|
|
for node in jobs_by_pol_part_dev['00sdd']['nodes']],
|
|
|
|
[0, 1, 2])
|
|
|
|
self.assertEqual([node['id']
|
|
|
|
for node in jobs_by_pol_part_dev['01sdc']['nodes']],
|
|
|
|
[1, 3])
|
|
|
|
self.assertEqual([node['id']
|
|
|
|
for node in jobs_by_pol_part_dev['01sdd']['nodes']],
|
|
|
|
[1, 2])
|
|
|
|
self.assertEqual([node['id']
|
|
|
|
for node in jobs_by_pol_part_dev['02sdc']['nodes']],
|
|
|
|
[3, 0])
|
|
|
|
self.assertEqual([node['id']
|
|
|
|
for node in jobs_by_pol_part_dev['02sdd']['nodes']],
|
|
|
|
[2, 0])
|
|
|
|
self.assertEqual([node['id']
|
|
|
|
for node in jobs_by_pol_part_dev['03sdc']['nodes']],
|
|
|
|
[3, 0, 1])
|
|
|
|
self.assertEqual([node['id']
|
|
|
|
for node in jobs_by_pol_part_dev['03sdd']['nodes']],
|
|
|
|
[0, 1])
|
|
|
|
self.assertEqual([node['id']
|
|
|
|
for node in jobs_by_pol_part_dev['10sdc']['nodes']],
|
|
|
|
[0, 1])
|
|
|
|
self.assertEqual([node['id']
|
|
|
|
for node in jobs_by_pol_part_dev['10sdd']['nodes']],
|
|
|
|
[0, 1, 2])
|
|
|
|
self.assertEqual([node['id']
|
|
|
|
for node in jobs_by_pol_part_dev['11sdc']['nodes']],
|
|
|
|
[1, 3])
|
|
|
|
self.assertEqual([node['id']
|
|
|
|
for node in jobs_by_pol_part_dev['11sdd']['nodes']],
|
|
|
|
[1, 2])
|
|
|
|
self.assertEqual([node['id']
|
|
|
|
for node in jobs_by_pol_part_dev['12sdc']['nodes']],
|
|
|
|
[3, 0])
|
|
|
|
self.assertEqual([node['id']
|
|
|
|
for node in jobs_by_pol_part_dev['12sdd']['nodes']],
|
|
|
|
[2, 0])
|
|
|
|
self.assertEqual([node['id']
|
|
|
|
for node in jobs_by_pol_part_dev['13sdc']['nodes']],
|
|
|
|
[3, 0, 1])
|
|
|
|
self.assertEqual([node['id']
|
|
|
|
for node in jobs_by_pol_part_dev['13sdd']['nodes']],
|
|
|
|
[0, 1])
|
Allow 1+ object-servers-per-disk deployment
Enabled by a new > 0 integer config value, "servers_per_port" in the
[DEFAULT] config section for object-server and/or replication server
configs. The setting's integer value determines how many different
object-server workers handle requests for any single unique local port
in the ring. In this mode, the parent swift-object-server process
continues to run as the original user (i.e. root if low-port binding
is required), binds to all ports as defined in the ring, and forks off
the specified number of workers per listen socket. The child, per-port
servers drop privileges and behave pretty much how object-server workers
always have, except that because the ring has unique ports per disk, the
object-servers will only be handling requests for a single disk. The
parent process detects dead servers and restarts them (with the correct
listen socket), starts missing servers when an updated ring file is
found with a device on the server with a new port, and kills extraneous
servers when their port is found to no longer be in the ring. The ring
files are stat'ed at most every "ring_check_interval" seconds, as
configured in the object-server config (same default of 15s).
Immediately stopping all swift-object-worker processes still works by
sending the parent a SIGTERM. Likewise, a SIGHUP to the parent process
still causes the parent process to close all listen sockets and exit,
allowing existing children to finish serving their existing requests.
The drop_privileges helper function now has an optional param to
suppress the setsid() call, which otherwise screws up the child workers'
process management.
The class method RingData.load() can be told to only load the ring
metadata (i.e. everything except replica2part2dev_id) with the optional
kwarg, header_only=True. This is used to keep the parent and all
forked off workers from unnecessarily having full copies of all storage
policy rings in memory.
A new helper class, swift.common.storage_policy.BindPortsCache,
provides a method to return a set of all device ports in all rings for
the server on which it is instantiated (identified by its set of IP
addresses). The BindPortsCache instance will track mtimes of ring
files, so they are not opened more frequently than necessary.
This patch includes enhancements to the probe tests and
object-replicator/object-reconstructor config plumbing to allow the
probe tests to work correctly both in the "normal" config (same IP but
unique ports for each SAIO "server") and a server-per-port setup where
each SAIO "server" must have a unique IP address and unique port per
disk within each "server". The main probe tests only work with 4
servers and 4 disks, but you can see the difference in the rings for the
EC probe tests where there are 2 disks per server for a total of 8
disks. Specifically, swift.common.ring.utils.is_local_device() will
ignore the ports when the "my_port" argument is None. Then,
object-replicator and object-reconstructor both set self.bind_port to
None if server_per_port is enabled. Bonus improvement for IPv6
addresses in is_local_device().
This PR for vagrant-swift-all-in-one will aid in testing this patch:
https://github.com/swiftstack/vagrant-swift-all-in-one/pull/16/
Also allow SAIO to answer is_local_device() better; common SAIO setups
have multiple "servers" all on the same host with different ports for
the different "servers" (which happen to match the IPs specified in the
rings for the devices on each of those "servers").
However, you can configure the SAIO to have different localhost IP
addresses (e.g. 127.0.0.1, 127.0.0.2, etc.) in the ring and in the
servers' config files' bind_ip setting.
This new whataremyips() implementation combined with a little plumbing
allows is_local_device() to accurately answer, even on an SAIO.
In the default case (an unspecified bind_ip defaults to '0.0.0.0') as
well as an explict "bind to everything" like '0.0.0.0' or '::',
whataremyips() behaves as it always has, returning all IP addresses for
the server.
Also updated probe tests to handle each "server" in the SAIO having a
unique IP address.
For some (noisy) benchmarks that show servers_per_port=X is at least as
good as the same number of "normal" workers:
https://gist.github.com/dbishop/c214f89ca708a6b1624a#file-summary-md
Benchmarks showing the benefits of I/O isolation with a small number of
slow disks:
https://gist.github.com/dbishop/fd0ab067babdecfb07ca#file-results-md
If you were wondering what the overhead of threads_per_disk looks like:
https://gist.github.com/dbishop/1d14755fedc86a161718#file-tabular_results-md
DocImpact
Change-Id: I2239a4000b41a7e7cc53465ce794af49d44796c6
2015-05-14 22:14:15 -07:00
|
|
|
for part in ['00', '01', '02', '03']:
|
2015-08-06 00:55:36 +05:30
|
|
|
self.assertEqual(jobs_by_pol_part_dev[part + 'sdc']['path'],
|
|
|
|
os.path.join(objects_sdc, part[1:]))
|
|
|
|
self.assertEqual(jobs_by_pol_part_dev[part + 'sdd']['path'],
|
|
|
|
os.path.join(objects_sdd, part[1:]))
|
Allow 1+ object-servers-per-disk deployment
Enabled by a new > 0 integer config value, "servers_per_port" in the
[DEFAULT] config section for object-server and/or replication server
configs. The setting's integer value determines how many different
object-server workers handle requests for any single unique local port
in the ring. In this mode, the parent swift-object-server process
continues to run as the original user (i.e. root if low-port binding
is required), binds to all ports as defined in the ring, and forks off
the specified number of workers per listen socket. The child, per-port
servers drop privileges and behave pretty much how object-server workers
always have, except that because the ring has unique ports per disk, the
object-servers will only be handling requests for a single disk. The
parent process detects dead servers and restarts them (with the correct
listen socket), starts missing servers when an updated ring file is
found with a device on the server with a new port, and kills extraneous
servers when their port is found to no longer be in the ring. The ring
files are stat'ed at most every "ring_check_interval" seconds, as
configured in the object-server config (same default of 15s).
Immediately stopping all swift-object-worker processes still works by
sending the parent a SIGTERM. Likewise, a SIGHUP to the parent process
still causes the parent process to close all listen sockets and exit,
allowing existing children to finish serving their existing requests.
The drop_privileges helper function now has an optional param to
suppress the setsid() call, which otherwise screws up the child workers'
process management.
The class method RingData.load() can be told to only load the ring
metadata (i.e. everything except replica2part2dev_id) with the optional
kwarg, header_only=True. This is used to keep the parent and all
forked off workers from unnecessarily having full copies of all storage
policy rings in memory.
A new helper class, swift.common.storage_policy.BindPortsCache,
provides a method to return a set of all device ports in all rings for
the server on which it is instantiated (identified by its set of IP
addresses). The BindPortsCache instance will track mtimes of ring
files, so they are not opened more frequently than necessary.
This patch includes enhancements to the probe tests and
object-replicator/object-reconstructor config plumbing to allow the
probe tests to work correctly both in the "normal" config (same IP but
unique ports for each SAIO "server") and a server-per-port setup where
each SAIO "server" must have a unique IP address and unique port per
disk within each "server". The main probe tests only work with 4
servers and 4 disks, but you can see the difference in the rings for the
EC probe tests where there are 2 disks per server for a total of 8
disks. Specifically, swift.common.ring.utils.is_local_device() will
ignore the ports when the "my_port" argument is None. Then,
object-replicator and object-reconstructor both set self.bind_port to
None if server_per_port is enabled. Bonus improvement for IPv6
addresses in is_local_device().
This PR for vagrant-swift-all-in-one will aid in testing this patch:
https://github.com/swiftstack/vagrant-swift-all-in-one/pull/16/
Also allow SAIO to answer is_local_device() better; common SAIO setups
have multiple "servers" all on the same host with different ports for
the different "servers" (which happen to match the IPs specified in the
rings for the devices on each of those "servers").
However, you can configure the SAIO to have different localhost IP
addresses (e.g. 127.0.0.1, 127.0.0.2, etc.) in the ring and in the
servers' config files' bind_ip setting.
This new whataremyips() implementation combined with a little plumbing
allows is_local_device() to accurately answer, even on an SAIO.
In the default case (an unspecified bind_ip defaults to '0.0.0.0') as
well as an explicit "bind to everything" like '0.0.0.0' or '::',
whataremyips() behaves as it always has, returning all IP addresses for
the server.
Also updated probe tests to handle each "server" in the SAIO having a
unique IP address.
For some (noisy) benchmarks that show servers_per_port=X is at least as
good as the same number of "normal" workers:
https://gist.github.com/dbishop/c214f89ca708a6b1624a#file-summary-md
Benchmarks showing the benefits of I/O isolation with a small number of
slow disks:
https://gist.github.com/dbishop/fd0ab067babdecfb07ca#file-results-md
If you were wondering what the overhead of threads_per_disk looks like:
https://gist.github.com/dbishop/1d14755fedc86a161718#file-tabular_results-md
DocImpact
Change-Id: I2239a4000b41a7e7cc53465ce794af49d44796c6
2015-05-14 22:14:15 -07:00
|
|
|
for part in ['10', '11', '12', '13']:
|
2015-08-06 00:55:36 +05:30
|
|
|
self.assertEqual(jobs_by_pol_part_dev[part + 'sdc']['path'],
|
|
|
|
os.path.join(objects_1_sdc, part[1:]))
|
|
|
|
self.assertEqual(jobs_by_pol_part_dev[part + 'sdd']['path'],
|
|
|
|
os.path.join(objects_1_sdd, part[1:]))
|
Allow 1+ object-servers-per-disk deployment
Enabled by a new > 0 integer config value, "servers_per_port" in the
[DEFAULT] config section for object-server and/or replication server
configs. The setting's integer value determines how many different
object-server workers handle requests for any single unique local port
in the ring. In this mode, the parent swift-object-server process
continues to run as the original user (i.e. root if low-port binding
is required), binds to all ports as defined in the ring, and forks off
the specified number of workers per listen socket. The child, per-port
servers drop privileges and behave pretty much how object-server workers
always have, except that because the ring has unique ports per disk, the
object-servers will only be handling requests for a single disk. The
parent process detects dead servers and restarts them (with the correct
listen socket), starts missing servers when an updated ring file is
found with a device on the server with a new port, and kills extraneous
servers when their port is found to no longer be in the ring. The ring
files are stat'ed at most every "ring_check_interval" seconds, as
configured in the object-server config (same default of 15s).
Immediately stopping all swift-object-worker processes still works by
sending the parent a SIGTERM. Likewise, a SIGHUP to the parent process
still causes the parent process to close all listen sockets and exit,
allowing existing children to finish serving their existing requests.
The drop_privileges helper function now has an optional param to
suppress the setsid() call, which otherwise screws up the child workers'
process management.
The class method RingData.load() can be told to only load the ring
metadata (i.e. everything except replica2part2dev_id) with the optional
kwarg, header_only=True. This is used to keep the parent and all
forked off workers from unnecessarily having full copies of all storage
policy rings in memory.
A new helper class, swift.common.storage_policy.BindPortsCache,
provides a method to return a set of all device ports in all rings for
the server on which it is instantiated (identified by its set of IP
addresses). The BindPortsCache instance will track mtimes of ring
files, so they are not opened more frequently than necessary.
This patch includes enhancements to the probe tests and
object-replicator/object-reconstructor config plumbing to allow the
probe tests to work correctly both in the "normal" config (same IP but
unique ports for each SAIO "server") and a server-per-port setup where
each SAIO "server" must have a unique IP address and unique port per
disk within each "server". The main probe tests only work with 4
servers and 4 disks, but you can see the difference in the rings for the
EC probe tests where there are 2 disks per server for a total of 8
disks. Specifically, swift.common.ring.utils.is_local_device() will
ignore the ports when the "my_port" argument is None. Then,
object-replicator and object-reconstructor both set self.bind_port to
None if server_per_port is enabled. Bonus improvement for IPv6
addresses in is_local_device().
This PR for vagrant-swift-all-in-one will aid in testing this patch:
https://github.com/swiftstack/vagrant-swift-all-in-one/pull/16/
Also allow SAIO to answer is_local_device() better; common SAIO setups
have multiple "servers" all on the same host with different ports for
the different "servers" (which happen to match the IPs specified in the
rings for the devices on each of those "servers").
However, you can configure the SAIO to have different localhost IP
addresses (e.g. 127.0.0.1, 127.0.0.2, etc.) in the ring and in the
servers' config files' bind_ip setting.
This new whataremyips() implementation combined with a little plumbing
allows is_local_device() to accurately answer, even on an SAIO.
In the default case (an unspecified bind_ip defaults to '0.0.0.0') as
well as an explict "bind to everything" like '0.0.0.0' or '::',
whataremyips() behaves as it always has, returning all IP addresses for
the server.
Also updated probe tests to handle each "server" in the SAIO having a
unique IP address.
For some (noisy) benchmarks that show servers_per_port=X is at least as
good as the same number of "normal" workers:
https://gist.github.com/dbishop/c214f89ca708a6b1624a#file-summary-md
Benchmarks showing the benefits of I/O isolation with a small number of
slow disks:
https://gist.github.com/dbishop/fd0ab067babdecfb07ca#file-results-md
If you were wondering what the overhead of threads_per_disk looks like:
https://gist.github.com/dbishop/1d14755fedc86a161718#file-tabular_results-md
DocImpact
Change-Id: I2239a4000b41a7e7cc53465ce794af49d44796c6
2015-05-14 22:14:15 -07:00
|
|
|
|
2013-08-22 19:23:29 +00:00
|
|
|
def test_collect_jobs_handoffs_first(self):
|
|
|
|
self.replicator.handoffs_first = True
|
|
|
|
jobs = self.replicator.collect_jobs()
|
|
|
|
self.assertTrue(jobs[0]['delete'])
|
2015-08-06 00:55:36 +05:30
|
|
|
self.assertEqual('1', jobs[0]['partition'])
|
2013-08-22 19:23:29 +00:00
|
|
|
|
2015-08-21 18:14:55 -07:00
|
|
|
def test_handoffs_first_mode_will_process_all_jobs_after_handoffs(self):
|
2018-02-09 12:20:59 +08:00
|
|
|
# make an object in the handoff & primary partition
|
2015-08-21 18:14:55 -07:00
|
|
|
expected_suffix_paths = []
|
|
|
|
for policy in POLICIES:
|
|
|
|
# primary
|
|
|
|
ts = next(self.ts)
|
|
|
|
df = self.df_mgr.get_diskfile('sda', '0', 'a', 'c', 'o', policy)
|
|
|
|
with df.create() as w:
|
2019-01-29 03:28:26 +01:00
|
|
|
w.write(b'asdf')
|
2015-08-21 18:14:55 -07:00
|
|
|
w.put({'X-Timestamp': ts.internal})
|
|
|
|
w.commit(ts)
|
|
|
|
expected_suffix_paths.append(os.path.dirname(df._datadir))
|
|
|
|
# handoff
|
|
|
|
ts = next(self.ts)
|
|
|
|
df = self.df_mgr.get_diskfile('sda', '1', 'a', 'c', 'o', policy)
|
|
|
|
with df.create() as w:
|
2019-01-29 03:28:26 +01:00
|
|
|
w.write(b'asdf')
|
2015-08-21 18:14:55 -07:00
|
|
|
w.put({'X-Timestamp': ts.internal})
|
|
|
|
w.commit(ts)
|
|
|
|
expected_suffix_paths.append(os.path.dirname(df._datadir))
|
|
|
|
|
|
|
|
# rsync will be called for all parts we created objects in
|
|
|
|
process_arg_checker = [
|
|
|
|
# (return_code, stdout, <each in capture rsync args>)
|
|
|
|
(0, '', []),
|
|
|
|
(0, '', []),
|
|
|
|
(0, '', []), # handoff job "first" policy
|
|
|
|
(0, '', []),
|
|
|
|
(0, '', []),
|
|
|
|
(0, '', []), # handoff job "second" policy
|
|
|
|
(0, '', []),
|
|
|
|
(0, '', []), # update job "first" policy
|
|
|
|
(0, '', []),
|
|
|
|
(0, '', []), # update job "second" policy
|
|
|
|
]
|
|
|
|
# each handoff partition node gets one replicate request for after
|
|
|
|
# rsync (2 * 3), each primary partition with objects gets two
|
|
|
|
# replicate requests (pre-flight and post sync) to each of each
|
|
|
|
# partners (2 * 2 * 2), the 2 remaining empty parts (2 & 3) get a
|
|
|
|
# pre-flight replicate request per node for each storage policy
|
|
|
|
# (2 * 2 * 2) - so 6 + 8 + 8 == 22
|
|
|
|
replicate_responses = [200] * 22
|
|
|
|
stub_body = pickle.dumps({})
|
|
|
|
with _mock_process(process_arg_checker) as rsync_log, \
|
|
|
|
mock.patch('swift.obj.replicator.whataremyips',
|
|
|
|
side_effect=_ips), \
|
|
|
|
mocked_http_conn(*replicate_responses,
|
|
|
|
body=stub_body) as conn_log:
|
|
|
|
self.replicator.handoffs_first = True
|
|
|
|
self.replicator.replicate()
|
|
|
|
# all jobs processed!
|
|
|
|
self.assertEqual(self.replicator.job_count,
|
Multiprocess object replicator
Add a multiprocess mode to the object replicator. Setting the
"replicator_workers" setting to a positive value N will result in the
replicator using up to N worker processes to perform replication
tasks.
At most one worker per disk will be spawned, so one can set
replicator_workers=99999999 to always get one worker per disk
regardless of the number of disks in each node. This is the same
behavior that the object reconstructor has.
Worker process logs will have a bit of information prepended so
operators can tell which messages came from which worker. It looks
like this:
[worker 1/2 pid=16529] 154/154 (100.00%) partitions replicated in 1.02s (150.87/sec, 0s remaining)
The prefix is "[worker M/N pid=P] ", where M is the worker's index, N
is the total number of workers, and P is the process ID. Every message
from the replicator's logger will have the prefix; this includes
messages from down in diskfile, but does not include things printed to
stdout or stderr.
Drive-by fix: don't dump recon stats when replicating only certain
policies. When running the object replicator with replicator_workers >
0 and "--policies=X,Y,Z", the replicator would update recon stats
after running. Since it only ran on a subset of objects, it should not
update recon, much like it doesn't update recon when run with
--devices or --partitions.
Change-Id: I6802a9ad9f1f9b9dafb99d8b095af0fdbf174dc5
2018-03-22 17:08:48 -07:00
|
|
|
self.replicator.total_stats.attempted)
|
2015-08-21 18:15:25 -07:00
|
|
|
self.assertFalse(self.replicator.handoffs_remaining)
|
2015-08-21 18:14:55 -07:00
|
|
|
|
|
|
|
# sanity, all the handoffs suffixes we filled in were rsync'd
|
|
|
|
found_rsync_suffix_paths = set()
|
|
|
|
for subprocess_info in rsync_log:
|
|
|
|
local_path, remote_path = subprocess_info['rsync_args'][-2:]
|
|
|
|
found_rsync_suffix_paths.add(local_path)
|
|
|
|
self.assertEqual(set(expected_suffix_paths), found_rsync_suffix_paths)
|
|
|
|
# sanity, all nodes got replicated
|
|
|
|
found_replicate_calls = defaultdict(int)
|
|
|
|
for req in conn_log.requests:
|
|
|
|
self.assertEqual(req['method'], 'REPLICATE')
|
|
|
|
found_replicate_key = (
|
|
|
|
int(req['headers']['X-Backend-Storage-Policy-Index']),
|
|
|
|
req['path'])
|
|
|
|
found_replicate_calls[found_replicate_key] += 1
|
|
|
|
expected_replicate_calls = {
|
|
|
|
(0, '/sda/1/a83'): 3,
|
|
|
|
(1, '/sda/1/a83'): 3,
|
|
|
|
(0, '/sda/0'): 2,
|
|
|
|
(0, '/sda/0/a83'): 2,
|
|
|
|
(1, '/sda/0'): 2,
|
|
|
|
(1, '/sda/0/a83'): 2,
|
|
|
|
(0, '/sda/2'): 2,
|
|
|
|
(1, '/sda/2'): 2,
|
|
|
|
(0, '/sda/3'): 2,
|
|
|
|
(1, '/sda/3'): 2,
|
|
|
|
}
|
2015-08-05 23:58:14 +05:30
|
|
|
self.assertEqual(dict(found_replicate_calls),
|
|
|
|
expected_replicate_calls)
|
2015-08-21 18:14:55 -07:00
|
|
|
|
2015-08-21 18:15:25 -07:00
|
|
|
def test_handoffs_first_mode_will_abort_if_handoffs_remaining(self):
|
|
|
|
# make an object in the handoff partition
|
|
|
|
handoff_suffix_paths = []
|
|
|
|
for policy in POLICIES:
|
|
|
|
ts = next(self.ts)
|
|
|
|
df = self.df_mgr.get_diskfile('sda', '1', 'a', 'c', 'o', policy)
|
|
|
|
with df.create() as w:
|
2019-01-29 03:28:26 +01:00
|
|
|
w.write(b'asdf')
|
2015-08-21 18:15:25 -07:00
|
|
|
w.put({'X-Timestamp': ts.internal})
|
|
|
|
w.commit(ts)
|
|
|
|
handoff_suffix_paths.append(os.path.dirname(df._datadir))
|
|
|
|
process_arg_checker = [
|
|
|
|
# (return_code, stdout, <each in capture rsync args>)
|
|
|
|
(0, '', []),
|
|
|
|
(1, '', []),
|
|
|
|
(0, '', []),
|
|
|
|
(0, '', []),
|
|
|
|
(0, '', []),
|
|
|
|
(0, '', []),
|
|
|
|
]
|
|
|
|
stub_body = pickle.dumps({})
|
|
|
|
with _mock_process(process_arg_checker) as rsync_log, \
|
|
|
|
mock.patch('swift.obj.replicator.whataremyips',
|
|
|
|
side_effect=_ips), \
|
|
|
|
mocked_http_conn(*[200] * 5, body=stub_body) as conn_log:
|
|
|
|
self.replicator.handoffs_first = True
|
|
|
|
self.replicator.replicate()
|
|
|
|
# stopped after handoffs!
|
|
|
|
self.assertEqual(1, self.replicator.handoffs_remaining)
|
|
|
|
self.assertEqual(8, self.replicator.job_count)
|
2016-02-11 10:59:43 -08:00
|
|
|
# in addition to the two update_deleted jobs as many as "concurrency"
|
2015-08-21 18:15:25 -07:00
|
|
|
# jobs may have been spawned into the pool before the failed
|
|
|
|
# update_deleted job incremented handoffs_remaining and caused the
|
|
|
|
# handoffs_first check to abort the current pass
|
Multiprocess object replicator
Add a multiprocess mode to the object replicator. Setting the
"replicator_workers" setting to a positive value N will result in the
replicator using up to N worker processes to perform replication
tasks.
At most one worker per disk will be spawned, so one can set
replicator_workers=99999999 to always get one worker per disk
regardless of the number of disks in each node. This is the same
behavior that the object reconstructor has.
Worker process logs will have a bit of information prepended so
operators can tell which messages came from which worker. It looks
like this:
[worker 1/2 pid=16529] 154/154 (100.00%) partitions replicated in 1.02s (150.87/sec, 0s remaining)
The prefix is "[worker M/N pid=P] ", where M is the worker's index, N
is the total number of workers, and P is the process ID. Every message
from the replicator's logger will have the prefix; this includes
messages from down in diskfile, but does not include things printed to
stdout or stderr.
Drive-by fix: don't dump recon stats when replicating only certain
policies. When running the object replicator with replicator_workers >
0 and "--policies=X,Y,Z", the replicator would update recon stats
after running. Since it only ran on a subset of objects, it should not
update recon, much like it doesn't update recon when run with
--devices or --partitions.
Change-Id: I6802a9ad9f1f9b9dafb99d8b095af0fdbf174dc5
2018-03-22 17:08:48 -07:00
|
|
|
self.assertLessEqual(self.replicator.total_stats.attempted,
|
2015-08-21 18:15:25 -07:00
|
|
|
2 + self.replicator.concurrency)
|
|
|
|
|
|
|
|
# sanity, all the handoffs suffixes we filled in were rsync'd
|
|
|
|
found_rsync_suffix_paths = set()
|
|
|
|
expected_replicate_requests = set()
|
|
|
|
for subprocess_info in rsync_log:
|
|
|
|
local_path, remote_path = subprocess_info['rsync_args'][-2:]
|
|
|
|
found_rsync_suffix_paths.add(local_path)
|
|
|
|
if subprocess_info['ret_code'] == 0:
|
|
|
|
node_ip = remote_path.split(':', 1)[0]
|
|
|
|
expected_replicate_requests.add(node_ip)
|
|
|
|
self.assertEqual(set(handoff_suffix_paths), found_rsync_suffix_paths)
|
2016-02-11 10:59:43 -08:00
|
|
|
# sanity, all successful rsync nodes got REPLICATE requests
|
2015-08-21 18:15:25 -07:00
|
|
|
found_replicate_requests = set()
|
|
|
|
self.assertEqual(5, len(conn_log.requests))
|
|
|
|
for req in conn_log.requests:
|
|
|
|
self.assertEqual(req['method'], 'REPLICATE')
|
|
|
|
found_replicate_requests.add(req['ip'])
|
|
|
|
self.assertEqual(expected_replicate_requests,
|
|
|
|
found_replicate_requests)
|
|
|
|
|
|
|
|
# and at least one partition got removed!
|
|
|
|
remaining_policies = []
|
|
|
|
for path in handoff_suffix_paths:
|
|
|
|
if os.path.exists(path):
|
|
|
|
policy = diskfile.extract_policy(path)
|
|
|
|
remaining_policies.append(policy)
|
|
|
|
self.assertEqual(len(remaining_policies), 1)
|
|
|
|
remaining_policy = remaining_policies[0]
|
|
|
|
|
|
|
|
# try again but with handoff_delete allowing for a single failure
|
|
|
|
with _mock_process(process_arg_checker) as rsync_log, \
|
|
|
|
mock.patch('swift.obj.replicator.whataremyips',
|
|
|
|
side_effect=_ips), \
|
|
|
|
mocked_http_conn(*[200] * 14, body=stub_body) as conn_log:
|
|
|
|
self.replicator.handoff_delete = 2
|
Multiprocess object replicator
Add a multiprocess mode to the object replicator. Setting the
"replicator_workers" setting to a positive value N will result in the
replicator using up to N worker processes to perform replication
tasks.
At most one worker per disk will be spawned, so one can set
replicator_workers=99999999 to always get one worker per disk
regardless of the number of disks in each node. This is the same
behavior that the object reconstructor has.
Worker process logs will have a bit of information prepended so
operators can tell which messages came from which worker. It looks
like this:
[worker 1/2 pid=16529] 154/154 (100.00%) partitions replicated in 1.02s (150.87/sec, 0s remaining)
The prefix is "[worker M/N pid=P] ", where M is the worker's index, N
is the total number of workers, and P is the process ID. Every message
from the replicator's logger will have the prefix; this includes
messages from down in diskfile, but does not include things printed to
stdout or stderr.
Drive-by fix: don't dump recon stats when replicating only certain
policies. When running the object replicator with replicator_workers >
0 and "--policies=X,Y,Z", the replicator would update recon stats
after running. Since it only ran on a subset of objects, it should not
update recon, much like it doesn't update recon when run with
--devices or --partitions.
Change-Id: I6802a9ad9f1f9b9dafb99d8b095af0fdbf174dc5
2018-03-22 17:08:48 -07:00
|
|
|
self.replicator._zero_stats()
|
2015-08-21 18:15:25 -07:00
|
|
|
self.replicator.replicate()
|
|
|
|
# all jobs processed!
|
|
|
|
self.assertEqual(self.replicator.job_count,
|
Multiprocess object replicator
Add a multiprocess mode to the object replicator. Setting the
"replicator_workers" setting to a positive value N will result in the
replicator using up to N worker processes to perform replication
tasks.
At most one worker per disk will be spawned, so one can set
replicator_workers=99999999 to always get one worker per disk
regardless of the number of disks in each node. This is the same
behavior that the object reconstructor has.
Worker process logs will have a bit of information prepended so
operators can tell which messages came from which worker. It looks
like this:
[worker 1/2 pid=16529] 154/154 (100.00%) partitions replicated in 1.02s (150.87/sec, 0s remaining)
The prefix is "[worker M/N pid=P] ", where M is the worker's index, N
is the total number of workers, and P is the process ID. Every message
from the replicator's logger will have the prefix; this includes
messages from down in diskfile, but does not include things printed to
stdout or stderr.
Drive-by fix: don't dump recon stats when replicating only certain
policies. When running the object replicator with replicator_workers >
0 and "--policies=X,Y,Z", the replicator would update recon stats
after running. Since it only ran on a subset of objects, it should not
update recon, much like it doesn't update recon when run with
--devices or --partitions.
Change-Id: I6802a9ad9f1f9b9dafb99d8b095af0fdbf174dc5
2018-03-22 17:08:48 -07:00
|
|
|
self.replicator.total_stats.attempted)
|
2015-08-21 18:15:25 -07:00
|
|
|
self.assertFalse(self.replicator.handoffs_remaining)
|
|
|
|
# sanity, all parts got replicated
|
|
|
|
found_replicate_calls = defaultdict(int)
|
|
|
|
for req in conn_log.requests:
|
|
|
|
self.assertEqual(req['method'], 'REPLICATE')
|
|
|
|
found_replicate_key = (
|
|
|
|
int(req['headers']['X-Backend-Storage-Policy-Index']),
|
|
|
|
req['path'])
|
|
|
|
found_replicate_calls[found_replicate_key] += 1
|
|
|
|
expected_replicate_calls = {
|
|
|
|
(int(remaining_policy), '/sda/1/a83'): 2,
|
|
|
|
(0, '/sda/0'): 2,
|
|
|
|
(1, '/sda/0'): 2,
|
|
|
|
(0, '/sda/2'): 2,
|
|
|
|
(1, '/sda/2'): 2,
|
|
|
|
(0, '/sda/3'): 2,
|
|
|
|
(1, '/sda/3'): 2,
|
|
|
|
}
|
|
|
|
self.assertEqual(dict(found_replicate_calls),
|
|
|
|
expected_replicate_calls)
|
|
|
|
|
|
|
|
# and now all handoff partitions have been rebalanced away!
|
|
|
|
removed_paths = set()
|
|
|
|
for path in handoff_suffix_paths:
|
|
|
|
if not os.path.exists(path):
|
|
|
|
removed_paths.add(path)
|
|
|
|
self.assertEqual(removed_paths, set(handoff_suffix_paths))
|
|
|
|
|
Improve object-replicator startup time.
The object replicator checks each partition directory to ensure it's
really a directory and not a zero-byte file. This was happening in
collect_jobs(), which is the first thing that the object replicator
does.
The effect was that, at startup, the object-replicator process would
list each "objects" or "objects-N" directory on each object device,
then stat() every single thing in there. On devices with lots of
partitions on them, this makes the replicator take a long time before
it does anything useful.
If you have a cluster with a too-high part_power plus some failing
disks elsewhere, you can easily get thousands of partition directories
on each disk. If you've got 36 disks per node, that turns into a very
long wait for the object replicator to do anything. Worse yet, if you
add in a configuration management system that pushes new rings every
couple hours, the object replicator can spend the vast majority of its
time collecting jobs, then only spend a short time doing useful work
before the ring changes and it has to start all over again.
This commit moves the stat() call (os.path.isfile) to the loop that
processes jobs. In a complete pass, the total work done is about the
same, but the replicator starts doing useful work much sooner.
Change-Id: I5ed4cd09dde514ec7d1e74afe35feaab0cf28a10
2014-12-08 15:05:29 -08:00
|
|
|
def test_replicator_skips_bogus_partition_dirs(self):
|
|
|
|
# A directory in the wrong place shouldn't crash the replicator
|
|
|
|
rmtree(self.objects)
|
|
|
|
rmtree(self.objects_1)
|
|
|
|
os.mkdir(self.objects)
|
|
|
|
os.mkdir(self.objects_1)
|
|
|
|
|
|
|
|
os.mkdir(os.path.join(self.objects, "burrito"))
|
|
|
|
jobs = self.replicator.collect_jobs()
|
|
|
|
self.assertEqual(len(jobs), 0)
|
|
|
|
|
2016-03-14 16:52:50 -07:00
|
|
|
def test_replicator_skips_rsync_temp_files(self):
|
|
|
|
# the empty pre-setup dirs aren't that useful to us
|
|
|
|
device_path = os.path.join(self.devices, 'sda')
|
|
|
|
rmtree(device_path, ignore_errors=1)
|
|
|
|
os.mkdir(device_path)
|
|
|
|
# create a real data file to trigger rsync
|
|
|
|
df = self.df_mgr.get_diskfile('sda', '0', 'a', 'c', 'o',
|
|
|
|
policy=POLICIES.legacy)
|
|
|
|
ts = next(self.ts)
|
|
|
|
with df.create() as w:
|
2019-01-29 03:28:26 +01:00
|
|
|
w.write(b'asdf')
|
2016-03-14 16:52:50 -07:00
|
|
|
w.put({'X-Timestamp': ts.internal})
|
|
|
|
w.commit(ts)
|
|
|
|
# pre-flight and post sync request for both other primaries
|
|
|
|
expected_replicate_requests = 4
|
|
|
|
process_arg_checker = [
|
|
|
|
# (return_code, stdout, <each in capture rsync args>)
|
|
|
|
(0, '', []),
|
|
|
|
(0, '', []),
|
|
|
|
]
|
|
|
|
stub_body = pickle.dumps({})
|
|
|
|
with _mock_process(process_arg_checker) as rsync_log, \
|
|
|
|
mock.patch('swift.obj.replicator.whataremyips',
|
|
|
|
side_effect=_ips), \
|
|
|
|
mocked_http_conn(*[200] * expected_replicate_requests,
|
|
|
|
body=stub_body) as conn_log:
|
|
|
|
self.replicator.replicate()
|
|
|
|
self.assertEqual(['REPLICATE'] * expected_replicate_requests,
|
|
|
|
[r['method'] for r in conn_log.requests])
|
|
|
|
# expect one rsync to each other primary node
|
|
|
|
self.assertEqual(2, len(rsync_log))
|
|
|
|
expected = '--exclude=.*.[0-9a-zA-Z][0-9a-zA-Z][0-9a-zA-Z]' \
|
|
|
|
'[0-9a-zA-Z][0-9a-zA-Z][0-9a-zA-Z]'
|
|
|
|
for subprocess_info in rsync_log:
|
|
|
|
rsync_args = subprocess_info['rsync_args']
|
|
|
|
for arg in rsync_args:
|
|
|
|
if arg.startswith('--exclude'):
|
|
|
|
self.assertEqual(arg, expected)
|
|
|
|
break
|
|
|
|
else:
|
|
|
|
self.fail('Did not find --exclude argument in %r' %
|
|
|
|
rsync_args)
|
|
|
|
|
Improve object-replicator startup time.
The object replicator checks each partition directory to ensure it's
really a directory and not a zero-byte file. This was happening in
collect_jobs(), which is the first thing that the object replicator
does.
The effect was that, at startup, the object-replicator process would
list each "objects" or "objects-N" directory on each object device,
then stat() every single thing in there. On devices with lots of
partitions on them, this makes the replicator take a long time before
it does anything useful.
If you have a cluster with a too-high part_power plus some failing
disks elsewhere, you can easily get thousands of partition directories
on each disk. If you've got 36 disks per node, that turns into a very
long wait for the object replicator to do anything. Worse yet, if you
add in a configuration management system that pushes new rings every
couple hours, the object replicator can spend the vast majority of its
time collecting jobs, then only spend a short time doing useful work
before the ring changes and it has to start all over again.
This commit moves the stat() call (os.path.isfile) to the loop that
processes jobs. In a complete pass, the total work done is about the
same, but the replicator starts doing useful work much sooner.
Change-Id: I5ed4cd09dde514ec7d1e74afe35feaab0cf28a10
2014-12-08 15:05:29 -08:00
|
|
|
def test_replicator_removes_zbf(self):
|
|
|
|
# After running xfs_repair, a partition directory could become a
|
|
|
|
# zero-byte file. If this happens, the replicator should clean it
|
|
|
|
# up, log something, and move on to the next partition.
|
|
|
|
|
|
|
|
# Surprise! Partition dir 1 is actually a zero-byte file.
|
|
|
|
pol_0_part_1_path = os.path.join(self.objects, '1')
|
|
|
|
rmtree(pol_0_part_1_path)
|
|
|
|
with open(pol_0_part_1_path, 'w'):
|
2012-09-04 13:59:26 -07:00
|
|
|
pass
|
Improve object-replicator startup time.
The object replicator checks each partition directory to ensure it's
really a directory and not a zero-byte file. This was happening in
collect_jobs(), which is the first thing that the object replicator
does.
The effect was that, at startup, the object-replicator process would
list each "objects" or "objects-N" directory on each object device,
then stat() every single thing in there. On devices with lots of
partitions on them, this makes the replicator take a long time before
it does anything useful.
If you have a cluster with a too-high part_power plus some failing
disks elsewhere, you can easily get thousands of partition directories
on each disk. If you've got 36 disks per node, that turns into a very
long wait for the object replicator to do anything. Worse yet, if you
add in a configuration management system that pushes new rings every
couple hours, the object replicator can spend the vast majority of its
time collecting jobs, then only spend a short time doing useful work
before the ring changes and it has to start all over again.
This commit moves the stat() call (os.path.isfile) to the loop that
processes jobs. In a complete pass, the total work done is about the
same, but the replicator starts doing useful work much sooner.
Change-Id: I5ed4cd09dde514ec7d1e74afe35feaab0cf28a10
2014-12-08 15:05:29 -08:00
|
|
|
self.assertTrue(os.path.isfile(pol_0_part_1_path)) # sanity check
|
|
|
|
|
|
|
|
# Policy 1's partition dir 1 is also a zero-byte file.
|
|
|
|
pol_1_part_1_path = os.path.join(self.objects_1, '1')
|
|
|
|
rmtree(pol_1_part_1_path)
|
|
|
|
with open(pol_1_part_1_path, 'w'):
|
2014-03-18 10:50:17 -07:00
|
|
|
pass
|
Improve object-replicator startup time.
The object replicator checks each partition directory to ensure it's
really a directory and not a zero-byte file. This was happening in
collect_jobs(), which is the first thing that the object replicator
does.
The effect was that, at startup, the object-replicator process would
list each "objects" or "objects-N" directory on each object device,
then stat() every single thing in there. On devices with lots of
partitions on them, this makes the replicator take a long time before
it does anything useful.
If you have a cluster with a too-high part_power plus some failing
disks elsewhere, you can easily get thousands of partition directories
on each disk. If you've got 36 disks per node, that turns into a very
long wait for the object replicator to do anything. Worse yet, if you
add in a configuration management system that pushes new rings every
couple hours, the object replicator can spend the vast majority of its
time collecting jobs, then only spend a short time doing useful work
before the ring changes and it has to start all over again.
This commit moves the stat() call (os.path.isfile) to the loop that
processes jobs. In a complete pass, the total work done is about the
same, but the replicator starts doing useful work much sooner.
Change-Id: I5ed4cd09dde514ec7d1e74afe35feaab0cf28a10
2014-12-08 15:05:29 -08:00
|
|
|
self.assertTrue(os.path.isfile(pol_1_part_1_path)) # sanity check
|
|
|
|
|
|
|
|
# Don't delete things in collect_jobs(); all the stat() calls would
|
|
|
|
# make replicator startup really slow.
|
|
|
|
self.replicator.collect_jobs()
|
|
|
|
self.assertTrue(os.path.exists(pol_0_part_1_path))
|
|
|
|
self.assertTrue(os.path.exists(pol_1_part_1_path))
|
|
|
|
|
|
|
|
# After a replication pass, the files should be gone
|
|
|
|
with mock.patch('swift.obj.replicator.http_connect',
|
|
|
|
mock_http_connect(200)):
|
|
|
|
self.replicator.run_once()
|
|
|
|
|
|
|
|
self.assertFalse(os.path.exists(pol_0_part_1_path))
|
|
|
|
self.assertFalse(os.path.exists(pol_1_part_1_path))
|
2015-03-31 22:35:37 -07:00
|
|
|
self.assertEqual(
|
|
|
|
sorted(self.logger.get_lines_for_level('warning')), [
|
|
|
|
('Removing partition directory which was a file: %s'
|
|
|
|
% pol_1_part_1_path),
|
|
|
|
('Removing partition directory which was a file: %s'
|
|
|
|
% pol_0_part_1_path),
|
|
|
|
])
|
2012-09-04 13:59:26 -07:00
|
|
|
|
2010-11-05 09:15:31 -07:00
|
|
|
def test_delete_partition(self):
|
2013-07-30 17:16:59 +02:00
|
|
|
with mock.patch('swift.obj.replicator.http_connect',
|
|
|
|
mock_http_connect(200)):
|
2015-03-17 08:32:57 +00:00
|
|
|
df = self.df_mgr.get_diskfile('sda', '1', 'a', 'c', 'o',
|
|
|
|
policy=POLICIES.legacy)
|
DiskFile API, with reference implementation
Refactor on-disk knowledge out of the object server by pushing the
async update pickle creation to the new DiskFileManager class (name is
not the best, so suggestions welcome), along with the REPLICATOR
method logic. We also move the mount checking and thread pool storage
to the new ondisk.Devices object, which then also becomes the new home
of the audit_location_generator method.
For the object server, a new setup() method is now called at the end
of the controller's construction, and the _diskfile() method has been
renamed to get_diskfile(), to allow implementation specific behavior.
We then hide the need for the REST API layer to know how and where
quarantining needs to be performed. There are now two places it is
checked internally, on open() where we verify the content-length,
name, and x-timestamp metadata, and in the reader on close where the
etag metadata is checked if the entire file was read.
We add a reader class to allow implementations to isolate the WSGI
handling code for that specific environment (it is used no-where else
in the REST APIs). This simplifies the caller's code to just use a
"with" statement once open to avoid multiple points where close needs
to be called.
For a full historical comparison, including the usage patterns see:
https://gist.github.com/portante/5488238
(as of master, 2b639f5, Merge
"Fix 500 from account-quota This Commit
middleware")
--------------------------------+------------------------------------
DiskFileManager(conf)
Methods:
.pickle_async_update()
.get_diskfile()
.get_hashes()
Attributes:
.devices
.logger
.disk_chunk_size
.keep_cache_size
.bytes_per_sync
DiskFile(a,c,o,keep_data_fp=) DiskFile(a,c,o)
Methods: Methods:
*.__iter__()
.close(verify_file=)
.is_deleted()
.is_expired()
.quarantine()
.get_data_file_size()
.open()
.read_metadata()
.create() .create()
.write_metadata()
.delete() .delete()
Attributes: Attributes:
.quarantined_dir
.keep_cache
.metadata
*DiskFileReader()
Methods:
.__iter__()
.close()
Attributes:
+.was_quarantined
DiskWriter() DiskFileWriter()
Methods: Methods:
.write() .write()
.put() .put()
* Note that the DiskFile class * Note that the DiskReader() object
implements all the methods returned by the
necessary for a WSGI app DiskFileOpened.reader() method
iterator implements all the methods
necessary for a WSGI app iterator
+ Note that if the auditor is
refactored to not use the DiskFile
class, see
https://review.openstack.org/44787
then we don't need the
was_quarantined attribute
A reference "in-memory" object server implementation of a backend
DiskFile class in swift/obj/mem_server.py and
swift/obj/mem_diskfile.py.
One can also reference
https://github.com/portante/gluster-swift/commits/diskfile for the
proposed integration with the gluster-swift code based on these
changes.
Change-Id: I44e153fdb405a5743e9c05349008f94136764916
Signed-off-by: Peter Portante <peter.portante@redhat.com>
2013-09-12 19:51:18 -04:00
|
|
|
mkdirs(df._datadir)
|
|
|
|
f = open(os.path.join(df._datadir,
|
2013-08-22 19:23:29 +00:00
|
|
|
normalize_timestamp(time.time()) + '.data'),
|
|
|
|
'wb')
|
2019-01-29 03:28:26 +01:00
|
|
|
f.write(b'1234567890')
|
2013-08-22 19:23:29 +00:00
|
|
|
f.close()
|
|
|
|
ohash = hash_path('a', 'c', 'o')
|
|
|
|
data_dir = ohash[-3:]
|
|
|
|
whole_path_from = os.path.join(self.objects, '1', data_dir)
|
|
|
|
part_path = os.path.join(self.objects, '1')
|
|
|
|
self.assertTrue(os.access(part_path, os.F_OK))
|
2015-03-17 08:32:57 +00:00
|
|
|
ring = self.replicator.load_object_ring(POLICIES[0])
|
2013-08-22 19:23:29 +00:00
|
|
|
nodes = [node for node in
|
2014-03-18 10:50:17 -07:00
|
|
|
ring.get_part_nodes(1)
|
2013-08-22 19:23:29 +00:00
|
|
|
if node['ip'] not in _ips()]
|
|
|
|
process_arg_checker = []
|
|
|
|
for node in nodes:
|
|
|
|
rsync_mod = '%s::object/sda/objects/%s' % (node['ip'], 1)
|
|
|
|
process_arg_checker.append(
|
|
|
|
(0, '', ['rsync', whole_path_from, rsync_mod]))
|
|
|
|
with _mock_process(process_arg_checker):
|
|
|
|
self.replicator.replicate()
|
|
|
|
self.assertFalse(os.access(part_path, os.F_OK))
|
|
|
|
|
2015-02-26 16:37:16 -08:00
|
|
|
def test_delete_partition_default_sync_method(self):
|
|
|
|
self.replicator.conf.pop('sync_method')
|
|
|
|
with mock.patch('swift.obj.replicator.http_connect',
|
|
|
|
mock_http_connect(200)):
|
2015-03-17 08:32:57 +00:00
|
|
|
df = self.df_mgr.get_diskfile('sda', '1', 'a', 'c', 'o',
|
|
|
|
policy=POLICIES.legacy)
|
2015-02-26 16:37:16 -08:00
|
|
|
mkdirs(df._datadir)
|
|
|
|
f = open(os.path.join(df._datadir,
|
|
|
|
normalize_timestamp(time.time()) + '.data'),
|
|
|
|
'wb')
|
2019-01-29 03:28:26 +01:00
|
|
|
f.write(b'1234567890')
|
2015-02-26 16:37:16 -08:00
|
|
|
f.close()
|
|
|
|
ohash = hash_path('a', 'c', 'o')
|
|
|
|
data_dir = ohash[-3:]
|
|
|
|
whole_path_from = os.path.join(self.objects, '1', data_dir)
|
|
|
|
part_path = os.path.join(self.objects, '1')
|
|
|
|
self.assertTrue(os.access(part_path, os.F_OK))
|
2015-03-17 08:32:57 +00:00
|
|
|
ring = self.replicator.load_object_ring(POLICIES[0])
|
2015-02-26 16:37:16 -08:00
|
|
|
nodes = [node for node in
|
|
|
|
ring.get_part_nodes(1)
|
|
|
|
if node['ip'] not in _ips()]
|
|
|
|
process_arg_checker = []
|
|
|
|
for node in nodes:
|
|
|
|
rsync_mod = '%s::object/sda/objects/%s' % (node['ip'], 1)
|
|
|
|
process_arg_checker.append(
|
|
|
|
(0, '', ['rsync', whole_path_from, rsync_mod]))
|
|
|
|
with _mock_process(process_arg_checker):
|
|
|
|
self.replicator.replicate()
|
|
|
|
self.assertFalse(os.access(part_path, os.F_OK))
|
|
|
|
|
|
|
|
def test_delete_partition_ssync_single_region(self):
|
|
|
|
devs = [
|
|
|
|
{'id': 0, 'device': 'sda', 'zone': 0,
|
2016-02-01 18:06:54 +00:00
|
|
|
'region': 1, 'ip': '127.0.0.0', 'port': 6200},
|
2015-02-26 16:37:16 -08:00
|
|
|
{'id': 1, 'device': 'sda', 'zone': 1,
|
2016-02-01 18:06:54 +00:00
|
|
|
'region': 1, 'ip': '127.0.0.1', 'port': 6200},
|
2015-02-26 16:37:16 -08:00
|
|
|
{'id': 2, 'device': 'sda', 'zone': 2,
|
2016-02-01 18:06:54 +00:00
|
|
|
'region': 1, 'ip': '127.0.0.2', 'port': 6200},
|
2015-02-26 16:37:16 -08:00
|
|
|
{'id': 3, 'device': 'sda', 'zone': 4,
|
2016-02-01 18:06:54 +00:00
|
|
|
'region': 1, 'ip': '127.0.0.3', 'port': 6200},
|
2015-02-26 16:37:16 -08:00
|
|
|
{'id': 4, 'device': 'sda', 'zone': 5,
|
2016-02-01 18:06:54 +00:00
|
|
|
'region': 1, 'ip': '127.0.0.4', 'port': 6200},
|
2015-02-26 16:37:16 -08:00
|
|
|
{'id': 5, 'device': 'sda', 'zone': 6,
|
2016-02-01 18:06:54 +00:00
|
|
|
'region': 1, 'ip': 'fe80::202:b3ff:fe1e:8329', 'port': 6200},
|
2015-02-26 16:37:16 -08:00
|
|
|
{'id': 6, 'device': 'sda', 'zone': 7, 'region': 1,
|
2016-02-01 18:06:54 +00:00
|
|
|
'ip': '2001:0db8:85a3:0000:0000:8a2e:0370:7334', 'port': 6200},
|
2015-02-26 16:37:16 -08:00
|
|
|
]
|
|
|
|
_create_test_rings(self.testdir, devs=devs)
|
|
|
|
self.conf['sync_method'] = 'ssync'
|
|
|
|
self.replicator = object_replicator.ObjectReplicator(self.conf)
|
|
|
|
self.replicator.logger = debug_logger()
|
Enable Object Replicator's failure count in recon
This patch makes the count of object replication failure in recon.
And "failure_nodes" is added to Account Replicator and
Container Replicator.
Recon shows the count of object repliction failure as follows:
$ curl http://<ip>:<port>/recon/replication/object
{
"replication_last": 1416334368.60865,
"replication_stats": {
"attempted": 13346,
"failure": 870,
"failure_nodes": {
"192.168.0.1": {"sdb1": 3},
"192.168.0.2": {"sdb1": 851,
"sdc1": 1,
"sdd1": 8},
"192.168.0.3": {"sdb1": 3,
"sdc1": 4}
},
"hashmatch": 0,
"remove": 0,
"rsync": 0,
"start": 1416354240.9761429,
"success": 1908
},
"replication_time": 2316.5563162644703,
"object_replication_last": 1416334368.60865,
"object_replication_time": 2316.5563162644703
}
Note that 'object_replication_last' and 'object_replication_time' are
considered to be transitional and will be removed in the subsequent
releases. Use 'replication_last' and 'replication_time' instead.
Additionaly this patch adds the count in swift-recon and it will be
showed as follows:
$ swift-recon object -r
========================================================================
=======
--> Starting reconnaissance on 4 hosts
========================================================================
=======
[2014-11-27 16:14:09] Checking on replication
[replication_failure] low: 0, high: 0, avg: 0.0, total: 0, Failed: 0.0%,
no_result: 0, reported: 4
[replication_success] low: 3, high: 3, avg: 3.0, total: 12,
Failed: 0.0%, no_result: 0, reported: 4
[replication_time] low: 0, high: 0, avg: 0.0, total: 0, Failed: 0.0%,
no_result: 0, reported: 4
[replication_attempted] low: 1, high: 1, avg: 1.0, total: 4,
Failed: 0.0%, no_result: 0, reported: 4
Oldest completion was 2014-11-27 16:09:45 (4 minutes ago) by
192.168.0.4:6002.
Most recent completion was 2014-11-27 16:14:19 (-10 seconds ago) by
192.168.0.1:6002.
========================================================================
=======
In case there is a cluster which has servers, a server runs with this
patch and the other servers run without this patch. If swift-recon
executes on the server which runs with this patch, there are unnecessary
information on the output such as [failure], [success] and [attempted].
Because other servers which run without this patch are not able to
send a response with information that this patch needs.
Therefore once you apply this patch, you also apply this patch to other
servers before you execute swift-recon.
DocImpact
Change-Id: Iecd33655ae2568482833131f422679996c374d78
Co-Authored-By: Kenichiro Matsuda <matsuda_kenichi@jp.fujitsu.com>
Co-Authored-By: Brian Cline <bcline@softlayer.com>
Implements: blueprint enable-object-replication-failure-in-recon
2014-12-03 06:15:16 +09:00
|
|
|
self.replicator._zero_stats()
|
2015-02-26 16:37:16 -08:00
|
|
|
|
|
|
|
with mock.patch('swift.obj.replicator.http_connect',
|
|
|
|
mock_http_connect(200)):
|
2015-03-17 08:32:57 +00:00
|
|
|
df = self.df_mgr.get_diskfile('sda', '1', 'a', 'c', 'o',
|
|
|
|
policy=POLICIES.legacy)
|
2015-02-26 16:37:16 -08:00
|
|
|
mkdirs(df._datadir)
|
2014-10-28 09:51:06 -07:00
|
|
|
ts = normalize_timestamp(time.time())
|
|
|
|
f = open(os.path.join(df._datadir, ts + '.data'),
|
2015-02-26 16:37:16 -08:00
|
|
|
'wb')
|
2019-01-29 03:28:26 +01:00
|
|
|
f.write(b'1234567890')
|
2015-02-26 16:37:16 -08:00
|
|
|
f.close()
|
|
|
|
ohash = hash_path('a', 'c', 'o')
|
|
|
|
whole_path_from = storage_directory(self.objects, 1, ohash)
|
|
|
|
suffix_dir_path = os.path.dirname(whole_path_from)
|
|
|
|
part_path = os.path.join(self.objects, '1')
|
|
|
|
self.assertTrue(os.access(part_path, os.F_OK))
|
|
|
|
|
|
|
|
def _fake_ssync(node, job, suffixes, **kwargs):
|
2014-10-28 09:51:06 -07:00
|
|
|
return True, {ohash: ts}
|
2015-02-26 16:37:16 -08:00
|
|
|
|
|
|
|
self.replicator.sync_method = _fake_ssync
|
|
|
|
self.replicator.replicate()
|
|
|
|
self.assertFalse(os.access(whole_path_from, os.F_OK))
|
|
|
|
self.assertFalse(os.access(suffix_dir_path, os.F_OK))
|
|
|
|
self.assertFalse(os.access(part_path, os.F_OK))
|
|
|
|
|
2014-03-18 10:50:17 -07:00
|
|
|
def test_delete_partition_1(self):
|
|
|
|
with mock.patch('swift.obj.replicator.http_connect',
|
|
|
|
mock_http_connect(200)):
|
|
|
|
df = self.df_mgr.get_diskfile('sda', '1', 'a', 'c', 'o',
|
2015-03-17 08:32:57 +00:00
|
|
|
policy=POLICIES[1])
|
2014-03-18 10:50:17 -07:00
|
|
|
mkdirs(df._datadir)
|
|
|
|
f = open(os.path.join(df._datadir,
|
|
|
|
normalize_timestamp(time.time()) + '.data'),
|
|
|
|
'wb')
|
2019-01-29 03:28:26 +01:00
|
|
|
f.write(b'1234567890')
|
2014-03-18 10:50:17 -07:00
|
|
|
f.close()
|
|
|
|
ohash = hash_path('a', 'c', 'o')
|
|
|
|
data_dir = ohash[-3:]
|
|
|
|
whole_path_from = os.path.join(self.objects_1, '1', data_dir)
|
|
|
|
part_path = os.path.join(self.objects_1, '1')
|
|
|
|
self.assertTrue(os.access(part_path, os.F_OK))
|
2015-03-17 08:32:57 +00:00
|
|
|
ring = self.replicator.load_object_ring(POLICIES[1])
|
2014-03-18 10:50:17 -07:00
|
|
|
nodes = [node for node in
|
|
|
|
ring.get_part_nodes(1)
|
|
|
|
if node['ip'] not in _ips()]
|
|
|
|
process_arg_checker = []
|
|
|
|
for node in nodes:
|
|
|
|
rsync_mod = '%s::object/sda/objects-1/%s' % (node['ip'], 1)
|
|
|
|
process_arg_checker.append(
|
|
|
|
(0, '', ['rsync', whole_path_from, rsync_mod]))
|
|
|
|
with _mock_process(process_arg_checker):
|
|
|
|
self.replicator.replicate()
|
|
|
|
self.assertFalse(os.access(part_path, os.F_OK))
|
|
|
|
|
2013-08-22 19:23:29 +00:00
|
|
|
def test_delete_partition_with_failures(self):
|
|
|
|
with mock.patch('swift.obj.replicator.http_connect',
|
|
|
|
mock_http_connect(200)):
|
2015-03-17 08:32:57 +00:00
|
|
|
df = self.df_mgr.get_diskfile('sda', '1', 'a', 'c', 'o',
|
|
|
|
policy=POLICIES.legacy)
|
DiskFile API, with reference implementation
Refactor on-disk knowledge out of the object server by pushing the
async update pickle creation to the new DiskFileManager class (name is
not the best, so suggestions welcome), along with the REPLICATOR
method logic. We also move the mount checking and thread pool storage
to the new ondisk.Devices object, which then also becomes the new home
of the audit_location_generator method.
For the object server, a new setup() method is now called at the end
of the controller's construction, and the _diskfile() method has been
renamed to get_diskfile(), to allow implementation specific behavior.
We then hide the need for the REST API layer to know how and where
quarantining needs to be performed. There are now two places it is
checked internally, on open() where we verify the content-length,
name, and x-timestamp metadata, and in the reader on close where the
etag metadata is checked if the entire file was read.
We add a reader class to allow implementations to isolate the WSGI
handling code for that specific environment (it is used no-where else
in the REST APIs). This simplifies the caller's code to just use a
"with" statement once open to avoid multiple points where close needs
to be called.
For a full historical comparison, including the usage patterns see:
https://gist.github.com/portante/5488238
(as of master, 2b639f5, Merge
"Fix 500 from account-quota This Commit
middleware")
--------------------------------+------------------------------------
DiskFileManager(conf)
Methods:
.pickle_async_update()
.get_diskfile()
.get_hashes()
Attributes:
.devices
.logger
.disk_chunk_size
.keep_cache_size
.bytes_per_sync
DiskFile(a,c,o,keep_data_fp=) DiskFile(a,c,o)
Methods: Methods:
*.__iter__()
.close(verify_file=)
.is_deleted()
.is_expired()
.quarantine()
.get_data_file_size()
.open()
.read_metadata()
.create() .create()
.write_metadata()
.delete() .delete()
Attributes: Attributes:
.quarantined_dir
.keep_cache
.metadata
*DiskFileReader()
Methods:
.__iter__()
.close()
Attributes:
+.was_quarantined
DiskWriter() DiskFileWriter()
Methods: Methods:
.write() .write()
.put() .put()
* Note that the DiskFile class * Note that the DiskReader() object
implements all the methods returned by the
necessary for a WSGI app DiskFileOpened.reader() method
iterator implements all the methods
necessary for a WSGI app iterator
+ Note that if the auditor is
refactored to not use the DiskFile
class, see
https://review.openstack.org/44787
then we don't need the
was_quarantined attribute
A reference "in-memory" object server implementation of a backend
DiskFile class in swift/obj/mem_server.py and
swift/obj/mem_diskfile.py.
One can also reference
https://github.com/portante/gluster-swift/commits/diskfile for the
proposed integration with the gluster-swift code based on these
changes.
Change-Id: I44e153fdb405a5743e9c05349008f94136764916
Signed-off-by: Peter Portante <peter.portante@redhat.com>
2013-09-12 19:51:18 -04:00
|
|
|
mkdirs(df._datadir)
|
|
|
|
f = open(os.path.join(df._datadir,
|
2013-08-22 19:23:29 +00:00
|
|
|
normalize_timestamp(time.time()) + '.data'),
|
|
|
|
'wb')
|
2019-01-29 03:28:26 +01:00
|
|
|
f.write(b'1234567890')
|
2013-08-22 19:23:29 +00:00
|
|
|
f.close()
|
|
|
|
ohash = hash_path('a', 'c', 'o')
|
|
|
|
data_dir = ohash[-3:]
|
|
|
|
whole_path_from = os.path.join(self.objects, '1', data_dir)
|
|
|
|
part_path = os.path.join(self.objects, '1')
|
|
|
|
self.assertTrue(os.access(part_path, os.F_OK))
|
2015-03-17 08:32:57 +00:00
|
|
|
ring = self.replicator.load_object_ring(POLICIES[0])
|
2013-08-22 19:23:29 +00:00
|
|
|
nodes = [node for node in
|
2014-03-18 10:50:17 -07:00
|
|
|
ring.get_part_nodes(1)
|
2013-08-22 19:23:29 +00:00
|
|
|
if node['ip'] not in _ips()]
|
|
|
|
process_arg_checker = []
|
|
|
|
for i, node in enumerate(nodes):
|
|
|
|
rsync_mod = '%s::object/sda/objects/%s' % (node['ip'], 1)
|
|
|
|
if i == 0:
|
|
|
|
# force one of the rsync calls to fail
|
|
|
|
ret_code = 1
|
|
|
|
else:
|
|
|
|
ret_code = 0
|
|
|
|
process_arg_checker.append(
|
|
|
|
(ret_code, '', ['rsync', whole_path_from, rsync_mod]))
|
|
|
|
with _mock_process(process_arg_checker):
|
|
|
|
self.replicator.replicate()
|
|
|
|
# The path should still exist
|
|
|
|
self.assertTrue(os.access(part_path, os.F_OK))
|
|
|
|
|
|
|
|
def test_delete_partition_with_handoff_delete(self):
|
|
|
|
with mock.patch('swift.obj.replicator.http_connect',
|
|
|
|
mock_http_connect(200)):
|
|
|
|
self.replicator.handoff_delete = 2
|
2015-03-17 08:32:57 +00:00
|
|
|
df = self.df_mgr.get_diskfile('sda', '1', 'a', 'c', 'o',
|
|
|
|
policy=POLICIES.legacy)
|
DiskFile API, with reference implementation
Refactor on-disk knowledge out of the object server by pushing the
async update pickle creation to the new DiskFileManager class (name is
not the best, so suggestions welcome), along with the REPLICATOR
method logic. We also move the mount checking and thread pool storage
to the new ondisk.Devices object, which then also becomes the new home
of the audit_location_generator method.
For the object server, a new setup() method is now called at the end
of the controller's construction, and the _diskfile() method has been
renamed to get_diskfile(), to allow implementation specific behavior.
We then hide the need for the REST API layer to know how and where
quarantining needs to be performed. There are now two places it is
checked internally, on open() where we verify the content-length,
name, and x-timestamp metadata, and in the reader on close where the
etag metadata is checked if the entire file was read.
We add a reader class to allow implementations to isolate the WSGI
handling code for that specific environment (it is used no-where else
in the REST APIs). This simplifies the caller's code to just use a
"with" statement once open to avoid multiple points where close needs
to be called.
For a full historical comparison, including the usage patterns see:
https://gist.github.com/portante/5488238
(as of master, 2b639f5, Merge
"Fix 500 from account-quota This Commit
middleware")
--------------------------------+------------------------------------
DiskFileManager(conf)
Methods:
.pickle_async_update()
.get_diskfile()
.get_hashes()
Attributes:
.devices
.logger
.disk_chunk_size
.keep_cache_size
.bytes_per_sync
DiskFile(a,c,o,keep_data_fp=) DiskFile(a,c,o)
Methods: Methods:
*.__iter__()
.close(verify_file=)
.is_deleted()
.is_expired()
.quarantine()
.get_data_file_size()
.open()
.read_metadata()
.create() .create()
.write_metadata()
.delete() .delete()
Attributes: Attributes:
.quarantined_dir
.keep_cache
.metadata
*DiskFileReader()
Methods:
.__iter__()
.close()
Attributes:
+.was_quarantined
DiskWriter() DiskFileWriter()
Methods: Methods:
.write() .write()
.put() .put()
* Note that the DiskFile class * Note that the DiskReader() object
implements all the methods returned by the
necessary for a WSGI app DiskFileOpened.reader() method
iterator implements all the methods
necessary for a WSGI app iterator
+ Note that if the auditor is
refactored to not use the DiskFile
class, see
https://review.openstack.org/44787
then we don't need the
was_quarantined attribute
A reference "in-memory" object server implementation of a backend
DiskFile class in swift/obj/mem_server.py and
swift/obj/mem_diskfile.py.
One can also reference
https://github.com/portante/gluster-swift/commits/diskfile for the
proposed integration with the gluster-swift code based on these
changes.
Change-Id: I44e153fdb405a5743e9c05349008f94136764916
Signed-off-by: Peter Portante <peter.portante@redhat.com>
2013-09-12 19:51:18 -04:00
|
|
|
mkdirs(df._datadir)
|
|
|
|
f = open(os.path.join(df._datadir,
|
2013-08-22 19:23:29 +00:00
|
|
|
normalize_timestamp(time.time()) + '.data'),
|
|
|
|
'wb')
|
2019-01-29 03:28:26 +01:00
|
|
|
f.write(b'1234567890')
|
2013-08-22 19:23:29 +00:00
|
|
|
f.close()
|
|
|
|
ohash = hash_path('a', 'c', 'o')
|
|
|
|
data_dir = ohash[-3:]
|
|
|
|
whole_path_from = os.path.join(self.objects, '1', data_dir)
|
2013-07-30 17:16:59 +02:00
|
|
|
part_path = os.path.join(self.objects, '1')
|
|
|
|
self.assertTrue(os.access(part_path, os.F_OK))
|
2015-03-17 08:32:57 +00:00
|
|
|
ring = self.replicator.load_object_ring(POLICIES[0])
|
2013-08-22 19:23:29 +00:00
|
|
|
nodes = [node for node in
|
2014-03-18 10:50:17 -07:00
|
|
|
ring.get_part_nodes(1)
|
2013-08-22 19:23:29 +00:00
|
|
|
if node['ip'] not in _ips()]
|
|
|
|
process_arg_checker = []
|
|
|
|
for i, node in enumerate(nodes):
|
|
|
|
rsync_mod = '%s::object/sda/objects/%s' % (node['ip'], 1)
|
|
|
|
if i == 0:
|
|
|
|
# force one of the rsync calls to fail
|
|
|
|
ret_code = 1
|
|
|
|
else:
|
|
|
|
ret_code = 0
|
|
|
|
process_arg_checker.append(
|
|
|
|
(ret_code, '', ['rsync', whole_path_from, rsync_mod]))
|
|
|
|
with _mock_process(process_arg_checker):
|
|
|
|
self.replicator.replicate()
|
2013-07-30 17:16:59 +02:00
|
|
|
self.assertFalse(os.access(part_path, os.F_OK))
|
2010-11-05 09:15:31 -07:00
|
|
|
|
2013-08-22 19:23:29 +00:00
|
|
|
def test_delete_partition_with_handoff_delete_failures(self):
    """Handoff partition is kept when too many rsync pushes fail.

    With ``handoff_delete = 2`` the replicator needs at least two
    successful syncs before it may delete a handoff partition; here two
    of the rsync calls are forced to fail, so the partition must remain.
    """
    with mock.patch('swift.obj.replicator.http_connect',
                    mock_http_connect(200)):
        self.replicator.handoff_delete = 2
        df = self.df_mgr.get_diskfile('sda', '1', 'a', 'c', 'o',
                                      policy=POLICIES.legacy)
        mkdirs(df._datadir)
        f = open(os.path.join(df._datadir,
                              normalize_timestamp(time.time()) + '.data'),
                 'wb')
        f.write(b'1234567890')
        f.close()
        ohash = hash_path('a', 'c', 'o')
        data_dir = ohash[-3:]
        whole_path_from = os.path.join(self.objects, '1', data_dir)
        part_path = os.path.join(self.objects, '1')
        # sanity: the partition exists before replication
        self.assertTrue(os.access(part_path, os.F_OK))
        ring = self.replicator.load_object_ring(POLICIES[0])
        # remote nodes only -- local IPs are not rsync targets
        nodes = [node for node in
                 ring.get_part_nodes(1)
                 if node['ip'] not in _ips()]
        process_arg_checker = []
        for i, node in enumerate(nodes):
            rsync_mod = '%s::object/sda/objects/%s' % (node['ip'], 1)
            if i in (0, 1):
                # force two of the rsync calls to fail
                ret_code = 1
            else:
                ret_code = 0
            process_arg_checker.append(
                (ret_code, '', ['rsync', whole_path_from, rsync_mod]))
        with _mock_process(process_arg_checker):
            self.replicator.replicate()
        # The partition should still exist: not enough successful syncs
        self.assertTrue(os.access(part_path, os.F_OK))
|
|
|
|
|
2014-05-29 00:54:07 -07:00
|
|
|
def test_delete_partition_with_handoff_delete_fail_in_other_region(self):
    """Handoff partition is kept when cross-region rsync fails.

    All rsync pushes to nodes outside region 1 are forced to fail, so
    the replicator must not delete the local handoff partition.
    """
    with mock.patch('swift.obj.replicator.http_connect',
                    mock_http_connect(200)):
        df = self.df_mgr.get_diskfile('sda', '1', 'a', 'c', 'o',
                                      policy=POLICIES.legacy)
        mkdirs(df._datadir)
        f = open(os.path.join(df._datadir,
                              normalize_timestamp(time.time()) + '.data'),
                 'wb')
        f.write(b'1234567890')
        f.close()
        ohash = hash_path('a', 'c', 'o')
        data_dir = ohash[-3:]
        whole_path_from = os.path.join(self.objects, '1', data_dir)
        part_path = os.path.join(self.objects, '1')
        # sanity: the partition exists before replication
        self.assertTrue(os.access(part_path, os.F_OK))
        ring = self.replicator.load_object_ring(POLICIES[0])
        nodes = [node for node in
                 ring.get_part_nodes(1)
                 if node['ip'] not in _ips()]
        process_arg_checker = []
        for node in nodes:
            rsync_mod = '%s::object/sda/objects/%s' % (node['ip'], 1)
            if node['region'] != 1:
                # force the rsync calls for the other region to fail
                ret_code = 1
            else:
                ret_code = 0
            process_arg_checker.append(
                (ret_code, '', ['rsync', whole_path_from, rsync_mod]))
        with _mock_process(process_arg_checker):
            self.replicator.replicate()
        # The partition should still exist
        self.assertTrue(os.access(part_path, os.F_OK))
|
|
|
|
|
2012-09-28 12:24:15 -07:00
|
|
|
def test_delete_partition_override_params(self):
    """Partition cleanup honors override_devices/override_partitions.

    A handoff partition must only be processed (and deleted) when the
    replication run is actually scoped to its device and partition.
    """
    df = self.df_mgr.get_diskfile('sda', '0', 'a', 'c', 'o',
                                  policy=POLICIES.legacy)
    mkdirs(df._datadir)
    part_path = os.path.join(self.objects, '1')
    # sanity: partition 1 exists before any replication run
    self.assertTrue(os.access(part_path, os.F_OK))
    # wrong device -- partition untouched
    self.replicator.replicate(override_devices=['sdb'])
    self.assertTrue(os.access(part_path, os.F_OK))
    # wrong partition -- partition untouched
    self.replicator.replicate(override_partitions=[9])
    self.assertTrue(os.access(part_path, os.F_OK))
    # matching device and partition -- now it gets cleaned up
    self.replicator.replicate(override_devices=['sda'],
                              override_partitions=[1])
    self.assertFalse(os.access(part_path, os.F_OK))
|
|
|
|
|
2015-01-22 15:26:19 -08:00
|
|
|
def test_delete_policy_override_params(self):
    """run_once(policies=...) only operates on the listed policies.

    Policy 0 data is left alone while policy 1 handoffs are replicated
    away; bogus policy indexes in the list are ignored.  Because only a
    subset of policies ran, no recon stats are dumped.
    """
    df0 = self.df_mgr.get_diskfile('sda', '99', 'a', 'c', 'o',
                                   policy=POLICIES.legacy)
    df1 = self.df_mgr.get_diskfile('sda', '99', 'a', 'c', 'o',
                                   policy=POLICIES[1])
    mkdirs(df0._datadir)
    mkdirs(df1._datadir)

    pol0_part_path = os.path.join(self.objects, '99')
    pol1_part_path = os.path.join(self.objects_1, '99')

    # sanity checks
    self.assertTrue(os.access(pol0_part_path, os.F_OK))
    self.assertTrue(os.access(pol1_part_path, os.F_OK))

    # a bogus policy index doesn't bother the replicator any more than a
    # bogus device or partition does
    self.replicator.run_once(policies='1,2,5')

    self.assertFalse(os.access(pol1_part_path, os.F_OK))
    self.assertTrue(os.access(pol0_part_path, os.F_OK))

    # since we weren't operating on everything, but only a subset of
    # storage policies, we didn't dump any recon stats.
    self.assertFalse(os.path.exists(
        os.path.join(self.recon_cache, 'object.recon')))
|
|
|
|
|
2014-05-29 00:54:07 -07:00
|
|
|
def test_delete_partition_ssync(self):
    """Handoff cleanup with ssync removes data, suffix, then partition.

    A fake ssync succeeds except on the third call (call_nums == 2), so
    the replicator needs three replicate() passes to fully delete the
    object file, its suffix dir, and finally the partition dir.
    """
    with mock.patch('swift.obj.replicator.http_connect',
                    mock_http_connect(200)):
        df = self.df_mgr.get_diskfile('sda', '1', 'a', 'c', 'o',
                                      policy=POLICIES.legacy)
        mkdirs(df._datadir)
        ts = normalize_timestamp(time.time())
        f = open(os.path.join(df._datadir, ts + '.data'),
                 'wb')
        f.write(b'0')
        f.close()
        ohash = hash_path('a', 'c', 'o')
        whole_path_from = storage_directory(self.objects, 1, ohash)
        suffix_dir_path = os.path.dirname(whole_path_from)
        part_path = os.path.join(self.objects, '1')
        self.assertTrue(os.access(part_path, os.F_OK))

        self.call_nums = 0
        self.conf['sync_method'] = 'ssync'

        def _fake_ssync(node, job, suffixes, **kwargs):
            success = True
            ret_val = {ohash: ts}
            if self.call_nums == 2:
                # ssync should return (True, []) only when the second
                # candidate node has not got the replica yet.
                success = False
                ret_val = {}
            self.call_nums += 1
            return success, ret_val

        self.replicator.sync_method = _fake_ssync
        self.replicator.replicate()
        # The file should still exist
        self.assertTrue(os.access(whole_path_from, os.F_OK))
        self.assertTrue(os.access(suffix_dir_path, os.F_OK))
        self.assertTrue(os.access(part_path, os.F_OK))
        self.replicator.replicate()
        # The file should be deleted at the second replicate call
        self.assertFalse(os.access(whole_path_from, os.F_OK))
        self.assertFalse(os.access(suffix_dir_path, os.F_OK))
        self.assertTrue(os.access(part_path, os.F_OK))
        self.replicator.replicate()
        # The partition should be deleted at the third replicate call
        self.assertFalse(os.access(whole_path_from, os.F_OK))
        self.assertFalse(os.access(suffix_dir_path, os.F_OK))
        self.assertFalse(os.access(part_path, os.F_OK))
        del self.call_nums
|
|
|
|
|
|
|
|
def test_delete_partition_ssync_with_sync_failure(self):
    """Nothing is deleted when ssync keeps failing.

    The fake ssync fails on every call except the third (call_nums ==
    2); a single success is not enough, so after three replicate()
    passes the data file, suffix dir, and partition must all remain.
    """
    with mock.patch('swift.obj.replicator.http_connect',
                    mock_http_connect(200)):
        df = self.df_mgr.get_diskfile('sda', '1', 'a', 'c', 'o',
                                      policy=POLICIES.legacy)
        ts = normalize_timestamp(time.time())
        mkdirs(df._datadir)
        f = open(os.path.join(df._datadir, ts + '.data'), 'wb')
        f.write(b'0')
        f.close()
        ohash = hash_path('a', 'c', 'o')
        whole_path_from = storage_directory(self.objects, 1, ohash)
        suffix_dir_path = os.path.dirname(whole_path_from)
        part_path = os.path.join(self.objects, '1')
        self.assertTrue(os.access(part_path, os.F_OK))
        self.call_nums = 0
        self.conf['sync_method'] = 'ssync'

        def _fake_ssync(node, job, suffixes, **kwags):
            success = False
            ret_val = {}
            if self.call_nums == 2:
                # ssync should return (True, []) only when the second
                # candidate node has not got the replica yet.
                success = True
                ret_val = {ohash: ts}
            self.call_nums += 1
            return success, ret_val

        self.replicator.sync_method = _fake_ssync
        self.replicator.replicate()
        # The file should still exist
        self.assertTrue(os.access(whole_path_from, os.F_OK))
        self.assertTrue(os.access(suffix_dir_path, os.F_OK))
        self.assertTrue(os.access(part_path, os.F_OK))
        self.replicator.replicate()
        # The file should still exist
        self.assertTrue(os.access(whole_path_from, os.F_OK))
        self.assertTrue(os.access(suffix_dir_path, os.F_OK))
        self.assertTrue(os.access(part_path, os.F_OK))
        self.replicator.replicate()
        # The file should still exist
        self.assertTrue(os.access(whole_path_from, os.F_OK))
        self.assertTrue(os.access(suffix_dir_path, os.F_OK))
        self.assertTrue(os.access(part_path, os.F_OK))
        del self.call_nums
|
|
|
|
|
2015-02-12 16:18:54 -08:00
|
|
|
def test_delete_objs_ssync_only_when_in_sync(self):
    """Objects are only deleted when remote nodes confirm they are in sync.

    The fake ssync reports success for the sync jobs but returns an
    empty set for the remote_check_objs verification pass, so nothing
    may be deleted locally even after all three calls.
    """
    self.replicator.logger = debug_logger('test-replicator')
    with mock.patch('swift.obj.replicator.http_connect',
                    mock_http_connect(200)):
        df = self.df_mgr.get_diskfile('sda', '1', 'a', 'c', 'o',
                                      policy=POLICIES.legacy)
        mkdirs(df._datadir)
        ts = normalize_timestamp(time.time())
        f = open(os.path.join(df._datadir, ts + '.data'), 'wb')
        f.write(b'0')
        f.close()
        ohash = hash_path('a', 'c', 'o')
        whole_path_from = storage_directory(self.objects, 1, ohash)
        suffix_dir_path = os.path.dirname(whole_path_from)
        part_path = os.path.join(self.objects, '1')
        self.assertTrue(os.access(part_path, os.F_OK))
        self.call_nums = 0
        self.conf['sync_method'] = 'ssync'

        # remote reports no objects in sync, so deletion is not allowed
        in_sync_objs = {}

        def _fake_ssync(node, job, suffixes, remote_check_objs=None):
            self.call_nums += 1
            if remote_check_objs is None:
                # sync job
                ret_val = {ohash: ts}
            else:
                ret_val = in_sync_objs
            return True, ret_val

        self.replicator.sync_method = _fake_ssync
        self.replicator.replicate()
        self.assertEqual(3, self.call_nums)
        # The file should still exist
        self.assertTrue(os.access(whole_path_from, os.F_OK))
        self.assertTrue(os.access(suffix_dir_path, os.F_OK))
        self.assertTrue(os.access(part_path, os.F_OK))

        del self.call_nums
|
|
|
|
|
2014-05-29 00:54:07 -07:00
|
|
|
def test_delete_partition_ssync_with_cleanup_failure(self):
    """Suffix-dir cleanup tolerates expected rmdir errors.

    os.rmdir on the suffix dir is made to fail with ENOENT, ENOTEMPTY,
    and ENOTDIR in turn.  ENOENT/ENOTEMPTY are expected races and must
    not be logged as errors; ENOTDIR is unexpected and must produce an
    error log line.  Once rmdir works again, cleanup proceeds down to
    the partition dir.
    """
    with mock.patch('swift.obj.replicator.http_connect',
                    mock_http_connect(200)):
        self.replicator.logger = mock_logger = \
            debug_logger('test-replicator')
        df = self.df_mgr.get_diskfile('sda', '1', 'a', 'c', 'o',
                                      policy=POLICIES.legacy)
        mkdirs(df._datadir)
        ts = normalize_timestamp(time.time())
        f = open(os.path.join(df._datadir, ts + '.data'), 'wb')
        f.write(b'0')
        f.close()
        ohash = hash_path('a', 'c', 'o')
        whole_path_from = storage_directory(self.objects, 1, ohash)
        suffix_dir_path = os.path.dirname(whole_path_from)
        part_path = os.path.join(self.objects, '1')
        self.assertTrue(os.access(part_path, os.F_OK))

        self.call_nums = 0
        self.conf['sync_method'] = 'ssync'

        def _fake_ssync(node, job, suffixes, **kwargs):
            success = True
            ret_val = {ohash: ts}
            if self.call_nums == 2:
                # ssync should return (True, []) only when the second
                # candidate node has not got the replica yet.
                success = False
                ret_val = {}
            self.call_nums += 1
            return success, ret_val

        rmdir_func = os.rmdir

        def raise_exception_rmdir(exception_class, error_no):
            # build an os.rmdir stand-in that fails only for the
            # suffix dir, with the requested errno
            instance = exception_class()
            instance.errno = error_no
            instance.strerror = os.strerror(error_no)

            def func(directory):
                if directory == suffix_dir_path:
                    raise instance
                else:
                    rmdir_func(directory)

            return func

        self.replicator.sync_method = _fake_ssync
        self.replicator.replicate()
        # The file should still exist
        self.assertTrue(os.access(whole_path_from, os.F_OK))
        self.assertTrue(os.access(suffix_dir_path, os.F_OK))
        self.assertTrue(os.access(part_path, os.F_OK))

        # Fail with ENOENT
        with mock.patch('os.rmdir',
                        raise_exception_rmdir(OSError, ENOENT)):
            self.replicator.replicate()
        self.assertFalse(mock_logger.get_lines_for_level('error'))
        self.assertFalse(os.access(whole_path_from, os.F_OK))
        self.assertTrue(os.access(suffix_dir_path, os.F_OK))
        self.assertTrue(os.access(part_path, os.F_OK))

        # Fail with ENOTEMPTY
        with mock.patch('os.rmdir',
                        raise_exception_rmdir(OSError, ENOTEMPTY)):
            self.replicator.replicate()
        self.assertFalse(mock_logger.get_lines_for_level('error'))
        self.assertFalse(os.access(whole_path_from, os.F_OK))
        self.assertTrue(os.access(suffix_dir_path, os.F_OK))
        self.assertTrue(os.access(part_path, os.F_OK))

        # Fail with ENOTDIR
        with mock.patch('os.rmdir',
                        raise_exception_rmdir(OSError, ENOTDIR)):
            self.replicator.replicate()
        self.assertEqual(mock_logger.get_lines_for_level('error'), [
            'Unexpected error trying to cleanup suffix dir:%r: ' %
            os.path.dirname(df._datadir),
        ])
        self.assertFalse(os.access(whole_path_from, os.F_OK))
        self.assertTrue(os.access(suffix_dir_path, os.F_OK))
        self.assertTrue(os.access(part_path, os.F_OK))

        # Finally we can cleanup everything
        self.replicator.replicate()
        self.assertFalse(os.access(whole_path_from, os.F_OK))
        self.assertFalse(os.access(suffix_dir_path, os.F_OK))
        self.assertTrue(os.access(part_path, os.F_OK))
        self.replicator.replicate()
        self.assertFalse(os.access(whole_path_from, os.F_OK))
        self.assertFalse(os.access(suffix_dir_path, os.F_OK))
        self.assertFalse(os.access(part_path, os.F_OK))
|
|
|
|
|
2010-11-05 09:15:31 -07:00
|
|
|
def test_run_once_recover_from_failure(self):
|
2014-03-18 10:50:17 -07:00
|
|
|
conf = dict(swift_dir=self.testdir, devices=self.devices,
|
Allow 1+ object-servers-per-disk deployment
Enabled by a new > 0 integer config value, "servers_per_port" in the
[DEFAULT] config section for object-server and/or replication server
configs. The setting's integer value determines how many different
object-server workers handle requests for any single unique local port
in the ring. In this mode, the parent swift-object-server process
continues to run as the original user (i.e. root if low-port binding
is required), binds to all ports as defined in the ring, and forks off
the specified number of workers per listen socket. The child, per-port
servers drop privileges and behave pretty much how object-server workers
always have, except that because the ring has unique ports per disk, the
object-servers will only be handling requests for a single disk. The
parent process detects dead servers and restarts them (with the correct
listen socket), starts missing servers when an updated ring file is
found with a device on the server with a new port, and kills extraneous
servers when their port is found to no longer be in the ring. The ring
files are stat'ed at most every "ring_check_interval" seconds, as
configured in the object-server config (same default of 15s).
Immediately stopping all swift-object-worker processes still works by
sending the parent a SIGTERM. Likewise, a SIGHUP to the parent process
still causes the parent process to close all listen sockets and exit,
allowing existing children to finish serving their existing requests.
The drop_privileges helper function now has an optional param to
suppress the setsid() call, which otherwise screws up the child workers'
process management.
The class method RingData.load() can be told to only load the ring
metadata (i.e. everything except replica2part2dev_id) with the optional
kwarg, header_only=True. This is used to keep the parent and all
forked off workers from unnecessarily having full copies of all storage
policy rings in memory.
A new helper class, swift.common.storage_policy.BindPortsCache,
provides a method to return a set of all device ports in all rings for
the server on which it is instantiated (identified by its set of IP
addresses). The BindPortsCache instance will track mtimes of ring
files, so they are not opened more frequently than necessary.
This patch includes enhancements to the probe tests and
object-replicator/object-reconstructor config plumbing to allow the
probe tests to work correctly both in the "normal" config (same IP but
unique ports for each SAIO "server") and a server-per-port setup where
each SAIO "server" must have a unique IP address and unique port per
disk within each "server". The main probe tests only work with 4
servers and 4 disks, but you can see the difference in the rings for the
EC probe tests where there are 2 disks per server for a total of 8
disks. Specifically, swift.common.ring.utils.is_local_device() will
ignore the ports when the "my_port" argument is None. Then,
object-replicator and object-reconstructor both set self.bind_port to
None if server_per_port is enabled. Bonus improvement for IPv6
addresses in is_local_device().
This PR for vagrant-swift-all-in-one will aid in testing this patch:
https://github.com/swiftstack/vagrant-swift-all-in-one/pull/16/
Also allow SAIO to answer is_local_device() better; common SAIO setups
have multiple "servers" all on the same host with different ports for
the different "servers" (which happen to match the IPs specified in the
rings for the devices on each of those "servers").
However, you can configure the SAIO to have different localhost IP
addresses (e.g. 127.0.0.1, 127.0.0.2, etc.) in the ring and in the
servers' config files' bind_ip setting.
This new whataremyips() implementation combined with a little plumbing
allows is_local_device() to accurately answer, even on an SAIO.
In the default case (an unspecified bind_ip defaults to '0.0.0.0') as
well as an explict "bind to everything" like '0.0.0.0' or '::',
whataremyips() behaves as it always has, returning all IP addresses for
the server.
Also updated probe tests to handle each "server" in the SAIO having a
unique IP address.
For some (noisy) benchmarks that show servers_per_port=X is at least as
good as the same number of "normal" workers:
https://gist.github.com/dbishop/c214f89ca708a6b1624a#file-summary-md
Benchmarks showing the benefits of I/O isolation with a small number of
slow disks:
https://gist.github.com/dbishop/fd0ab067babdecfb07ca#file-results-md
If you were wondering what the overhead of threads_per_disk looks like:
https://gist.github.com/dbishop/1d14755fedc86a161718#file-tabular_results-md
DocImpact
Change-Id: I2239a4000b41a7e7cc53465ce794af49d44796c6
2015-05-14 22:14:15 -07:00
|
|
|
bind_ip=_ips()[0],
|
2014-03-18 10:50:17 -07:00
|
|
|
mount_check='false', timeout='300', stats_interval='1')
|
|
|
|
replicator = object_replicator.ObjectReplicator(conf)
|
2010-11-05 09:15:31 -07:00
|
|
|
was_connector = object_replicator.http_connect
|
2011-11-28 09:13:41 -08:00
|
|
|
try:
|
|
|
|
object_replicator.http_connect = mock_http_connect(200)
|
|
|
|
# Write some files into '1' and run replicate- they should be moved
|
2014-09-18 21:16:35 -07:00
|
|
|
# to the other partitions and then node should get deleted.
|
2011-11-28 09:13:41 -08:00
|
|
|
cur_part = '1'
|
2015-03-17 08:32:57 +00:00
|
|
|
df = self.df_mgr.get_diskfile('sda', cur_part, 'a', 'c', 'o',
|
|
|
|
policy=POLICIES.legacy)
|
DiskFile API, with reference implementation
Refactor on-disk knowledge out of the object server by pushing the
async update pickle creation to the new DiskFileManager class (name is
not the best, so suggestions welcome), along with the REPLICATOR
method logic. We also move the mount checking and thread pool storage
to the new ondisk.Devices object, which then also becomes the new home
of the audit_location_generator method.
For the object server, a new setup() method is now called at the end
of the controller's construction, and the _diskfile() method has been
renamed to get_diskfile(), to allow implementation specific behavior.
We then hide the need for the REST API layer to know how and where
quarantining needs to be performed. There are now two places it is
checked internally, on open() where we verify the content-length,
name, and x-timestamp metadata, and in the reader on close where the
etag metadata is checked if the entire file was read.
We add a reader class to allow implementations to isolate the WSGI
handling code for that specific environment (it is used no-where else
in the REST APIs). This simplifies the caller's code to just use a
"with" statement once open to avoid multiple points where close needs
to be called.
For a full historical comparison, including the usage patterns see:
https://gist.github.com/portante/5488238
(as of master, 2b639f5, Merge
"Fix 500 from account-quota This Commit
middleware")
--------------------------------+------------------------------------
DiskFileManager(conf)
Methods:
.pickle_async_update()
.get_diskfile()
.get_hashes()
Attributes:
.devices
.logger
.disk_chunk_size
.keep_cache_size
.bytes_per_sync
DiskFile(a,c,o,keep_data_fp=) DiskFile(a,c,o)
Methods: Methods:
*.__iter__()
.close(verify_file=)
.is_deleted()
.is_expired()
.quarantine()
.get_data_file_size()
.open()
.read_metadata()
.create() .create()
.write_metadata()
.delete() .delete()
Attributes: Attributes:
.quarantined_dir
.keep_cache
.metadata
*DiskFileReader()
Methods:
.__iter__()
.close()
Attributes:
+.was_quarantined
DiskWriter() DiskFileWriter()
Methods: Methods:
.write() .write()
.put() .put()
* Note that the DiskFile class * Note that the DiskReader() object
implements all the methods returned by the
necessary for a WSGI app DiskFileOpened.reader() method
iterator implements all the methods
necessary for a WSGI app iterator
+ Note that if the auditor is
refactored to not use the DiskFile
class, see
https://review.openstack.org/44787
then we don't need the
was_quarantined attribute
A reference "in-memory" object server implementation of a backend
DiskFile class in swift/obj/mem_server.py and
swift/obj/mem_diskfile.py.
One can also reference
https://github.com/portante/gluster-swift/commits/diskfile for the
proposed integration with the gluster-swift code based on these
changes.
Change-Id: I44e153fdb405a5743e9c05349008f94136764916
Signed-off-by: Peter Portante <peter.portante@redhat.com>
2013-09-12 19:51:18 -04:00
|
|
|
mkdirs(df._datadir)
|
|
|
|
f = open(os.path.join(df._datadir,
|
2011-11-28 09:13:41 -08:00
|
|
|
normalize_timestamp(time.time()) + '.data'),
|
|
|
|
'wb')
|
2019-01-29 03:28:26 +01:00
|
|
|
f.write(b'1234567890')
|
2011-11-28 09:13:41 -08:00
|
|
|
f.close()
|
|
|
|
ohash = hash_path('a', 'c', 'o')
|
|
|
|
data_dir = ohash[-3:]
|
|
|
|
whole_path_from = os.path.join(self.objects, cur_part, data_dir)
|
2015-03-17 08:32:57 +00:00
|
|
|
ring = replicator.load_object_ring(POLICIES[0])
|
2011-11-28 09:13:41 -08:00
|
|
|
process_arg_checker = []
|
|
|
|
nodes = [node for node in
|
2014-03-18 10:50:17 -07:00
|
|
|
ring.get_part_nodes(int(cur_part))
|
2013-08-31 23:42:43 -04:00
|
|
|
if node['ip'] not in _ips()]
|
2011-11-28 09:13:41 -08:00
|
|
|
for node in nodes:
|
2012-06-04 13:27:39 +02:00
|
|
|
rsync_mod = '%s::object/sda/objects/%s' % (node['ip'],
|
|
|
|
cur_part)
|
2011-11-28 09:13:41 -08:00
|
|
|
process_arg_checker.append(
|
|
|
|
(0, '', ['rsync', whole_path_from, rsync_mod]))
|
|
|
|
self.assertTrue(os.access(os.path.join(self.objects,
|
|
|
|
'1', data_dir, ohash),
|
|
|
|
os.F_OK))
|
|
|
|
with _mock_process(process_arg_checker):
|
|
|
|
replicator.run_once()
|
|
|
|
self.assertFalse(process_errors)
|
|
|
|
for i, result in [('0', True), ('1', False),
|
|
|
|
('2', True), ('3', True)]:
|
2015-08-06 00:55:36 +05:30
|
|
|
self.assertEqual(os.access(
|
2013-08-31 23:42:43 -04:00
|
|
|
os.path.join(self.objects,
|
|
|
|
i, diskfile.HASH_FILE),
|
|
|
|
os.F_OK), result)
|
2011-11-28 09:13:41 -08:00
|
|
|
finally:
|
|
|
|
object_replicator.http_connect = was_connector
|
|
|
|
|
|
|
|
def test_run_once_recover_from_timeout(self):
|
2016-10-30 22:24:18 -07:00
|
|
|
# verify that replicator will pass over all policies' partitions even
|
|
|
|
# if a timeout occurs while replicating one partition to one node.
|
|
|
|
timeouts = [Timeout()]
|
|
|
|
|
Modify _get_hashes() arguments to be more generic
Some public functions in the diskfile manager expect or return full
file paths. It implies a filesystem diskfile implementation.
To make it easier to plug alternate diskfile implementations, patch
functions to take more generic arguments.
This commit changes DiskFileManager _get_hashes() arguments from:
- partition_path, recalculate=None, do_listdir=False
to :
- device, partition, policy, recalculate=None, do_listdir=False
Callers are modified accordingly, in diskfile.py, reconstructor.py,
and replicator.py
Change-Id: I8e2d7075572e466ae2fa5ebef5e31d87eed90fec
2017-03-24 15:41:45 +01:00
|
|
|
def fake_get_hashes(df_mgr, device, partition, policy, **kwargs):
|
2016-10-30 22:24:18 -07:00
|
|
|
self.get_hash_count += 1
|
Modify _get_hashes() arguments to be more generic
Some public functions in the diskfile manager expect or return full
file paths. It implies a filesystem diskfile implementation.
To make it easier to plug alternate diskfile implementations, patch
functions to take more generic arguments.
This commit changes DiskFileManager _get_hashes() arguments from:
- partition_path, recalculate=None, do_listdir=False
to :
- device, partition, policy, recalculate=None, do_listdir=False
Callers are modified accordingly, in diskfile.py, reconstructor.py,
and replicator.py
Change-Id: I8e2d7075572e466ae2fa5ebef5e31d87eed90fec
2017-03-24 15:41:45 +01:00
|
|
|
dev_path = df_mgr.get_dev_path(device)
|
|
|
|
part_path = os.path.join(dev_path, diskfile.get_data_dir(policy),
|
|
|
|
str(partition))
|
2016-10-30 22:24:18 -07:00
|
|
|
# Simulate a REPLICATE timeout by raising Timeout for second call
|
|
|
|
# to get_hashes (with recalculate suffixes) for a specific
|
|
|
|
# partition
|
|
|
|
if (timeouts and '/objects/' in part_path and
|
|
|
|
part_path.endswith('0') and 'recalculate' in kwargs):
|
|
|
|
raise timeouts.pop(0)
|
|
|
|
return 1, {'abc': 'def'}
|
|
|
|
|
|
|
|
# map partition_path -> [nodes]
|
|
|
|
sync_paths = collections.defaultdict(list)
|
|
|
|
|
|
|
|
def fake_sync(node, job, suffixes, *args, **kwargs):
|
|
|
|
sync_paths[job['path']].append(node)
|
|
|
|
return True, {}
|
|
|
|
|
2014-03-18 10:50:17 -07:00
|
|
|
conf = dict(swift_dir=self.testdir, devices=self.devices,
|
2016-10-30 22:24:18 -07:00
|
|
|
bind_ip=_ips()[0], # local dev has id=0
|
2014-03-18 10:50:17 -07:00
|
|
|
mount_check='false', timeout='300', stats_interval='1')
|
2016-10-30 22:24:18 -07:00
|
|
|
with mock.patch('swift.obj.diskfile.DiskFileManager._get_hashes',
|
|
|
|
fake_get_hashes):
|
|
|
|
with mock.patch('swift.obj.replicator.http_connect',
|
|
|
|
mock_http_connect(200)):
|
|
|
|
with mock.patch('swift.obj.replicator.dump_recon_cache'):
|
|
|
|
replicator = object_replicator.ObjectReplicator(
|
2017-08-22 22:40:58 +00:00
|
|
|
conf, logger=self.logger)
|
2016-10-30 22:24:18 -07:00
|
|
|
|
|
|
|
self.get_hash_count = 0
|
|
|
|
with mock.patch.object(replicator, 'sync', fake_sync):
|
|
|
|
replicator.run_once()
|
|
|
|
|
Multiprocess object replicator
Add a multiprocess mode to the object replicator. Setting the
"replicator_workers" setting to a positive value N will result in the
replicator using up to N worker processes to perform replication
tasks.
At most one worker per disk will be spawned, so one can set
replicator_workers=99999999 to always get one worker per disk
regardless of the number of disks in each node. This is the same
behavior that the object reconstructor has.
Worker process logs will have a bit of information prepended so
operators can tell which messages came from which worker. It looks
like this:
[worker 1/2 pid=16529] 154/154 (100.00%) partitions replicated in 1.02s (150.87/sec, 0s remaining)
The prefix is "[worker M/N pid=P] ", where M is the worker's index, N
is the total number of workers, and P is the process ID. Every message
from the replicator's logger will have the prefix; this includes
messages from down in diskfile, but does not include things printed to
stdout or stderr.
Drive-by fix: don't dump recon stats when replicating only certain
policies. When running the object replicator with replicator_workers >
0 and "--policies=X,Y,Z", the replicator would update recon stats
after running. Since it only ran on a subset of objects, it should not
update recon, much like it doesn't update recon when run with
--devices or --partitions.
Change-Id: I6802a9ad9f1f9b9dafb99d8b095af0fdbf174dc5
2018-03-22 17:08:48 -07:00
|
|
|
log_lines = replicator.logger.logger.get_lines_for_level('error')
|
2016-10-30 22:24:18 -07:00
|
|
|
self.assertIn("Error syncing with node:", log_lines[0])
|
|
|
|
self.assertFalse(log_lines[1:])
|
|
|
|
# setup creates 4 partitions; partition 1 does not map to local dev id
|
|
|
|
# 0 so will be handled by update_delete(); partitions 0, 2, 3 are
|
|
|
|
# handled by update() for each of two policies, so expect 6 paths to be
|
|
|
|
# sync'd
|
|
|
|
self.assertEqual(6, len(sync_paths))
|
|
|
|
# partition 3 has 2 nodes in remote region, only first node is sync'd.
|
|
|
|
# partition 0 in policy 0 has fake_get_hashes timeout before first
|
|
|
|
# sync, so only second node is sync'd.
|
|
|
|
# other partitions are sync'd to 2 nodes in same region.
|
|
|
|
expected_node_count = { # map path_end -> expected sync node count
|
|
|
|
'/objects/0': 1,
|
|
|
|
'/objects/1': 2,
|
|
|
|
'/objects/2': 2,
|
|
|
|
'/objects/3': 1,
|
|
|
|
'/objects-1/0': 2,
|
|
|
|
'/objects-1/1': 2,
|
|
|
|
'/objects-1/2': 2,
|
|
|
|
'/objects-1/3': 1
|
|
|
|
}
|
|
|
|
for path, nodes in sync_paths.items():
|
|
|
|
path_end = path[path.index('/objects'):]
|
|
|
|
self.assertEqual(expected_node_count[path_end], len(nodes),
|
|
|
|
'Expected %s but got %s for path %s' %
|
|
|
|
(expected_node_count[path_end], len(nodes), path))
|
|
|
|
# partitions 0 and 2 attempt 3 calls each per policy to get_hashes = 12
|
|
|
|
# partitions 3 attempts 2 calls per policy to get_hashes = 4
|
|
|
|
# partitions 1 dosn't get_hashes because of update_deleted
|
|
|
|
self.assertEqual(16, self.get_hash_count)
|
|
|
|
|
|
|
|
# attempt to 16 times but succeeded only 15 times due to Timeout
|
|
|
|
suffix_hashes = sum(
|
|
|
|
count for (metric, count), _junk in
|
Multiprocess object replicator
Add a multiprocess mode to the object replicator. Setting the
"replicator_workers" setting to a positive value N will result in the
replicator using up to N worker processes to perform replication
tasks.
At most one worker per disk will be spawned, so one can set
replicator_workers=99999999 to always get one worker per disk
regardless of the number of disks in each node. This is the same
behavior that the object reconstructor has.
Worker process logs will have a bit of information prepended so
operators can tell which messages came from which worker. It looks
like this:
[worker 1/2 pid=16529] 154/154 (100.00%) partitions replicated in 1.02s (150.87/sec, 0s remaining)
The prefix is "[worker M/N pid=P] ", where M is the worker's index, N
is the total number of workers, and P is the process ID. Every message
from the replicator's logger will have the prefix; this includes
messages from down in diskfile, but does not include things printed to
stdout or stderr.
Drive-by fix: don't dump recon stats when replicating only certain
policies. When running the object replicator with replicator_workers >
0 and "--policies=X,Y,Z", the replicator would update recon stats
after running. Since it only ran on a subset of objects, it should not
update recon, much like it doesn't update recon when run with
--devices or --partitions.
Change-Id: I6802a9ad9f1f9b9dafb99d8b095af0fdbf174dc5
2018-03-22 17:08:48 -07:00
|
|
|
replicator.logger.logger.log_dict['update_stats']
|
2016-10-30 22:24:18 -07:00
|
|
|
if metric == 'suffix.hashes')
|
|
|
|
self.assertEqual(15, suffix_hashes)
|
2010-07-12 17:03:45 -05:00
|
|
|
|
|
|
|
def test_run(self):
|
2010-11-16 08:32:03 -08:00
|
|
|
with _mock_process([(0, '')] * 100):
|
2013-08-05 15:35:34 -07:00
|
|
|
with mock.patch('swift.obj.replicator.http_connect',
|
|
|
|
mock_http_connect(200)):
|
|
|
|
self.replicator.replicate()
|
2010-07-12 17:03:45 -05:00
|
|
|
|
|
|
|
def test_run_withlog(self):
|
2010-11-16 08:32:03 -08:00
|
|
|
with _mock_process([(0, "stuff in log")] * 100):
|
2013-08-05 15:35:34 -07:00
|
|
|
with mock.patch('swift.obj.replicator.http_connect',
|
|
|
|
mock_http_connect(200)):
|
|
|
|
self.replicator.replicate()
|
2010-07-12 17:03:45 -05:00
|
|
|
|
Object replication ssync (an rsync alternative)
For this commit, ssync is just a direct replacement for how
we use rsync. Assuming we switch over to ssync completely
someday and drop rsync, we will then be able to improve the
algorithms even further (removing local objects as we
successfully transfer each one rather than waiting for whole
partitions, using an index.db with hash-trees, etc., etc.)
For easier review, this commit can be thought of in distinct
parts:
1) New global_conf_callback functionality for allowing
services to perform setup code before workers, etc. are
launched. (This is then used by ssync in the object
server to create a cross-worker semaphore to restrict
concurrent incoming replication.)
2) A bit of shifting of items up from object server and
replicator to diskfile or DEFAULT conf sections for
better sharing of the same settings. conn_timeout,
node_timeout, client_timeout, network_chunk_size,
disk_chunk_size.
3) Modifications to the object server and replicator to
optionally use ssync in place of rsync. This is done in
a generic enough way that switching to FutureSync should
be easy someday.
4) The biggest part, and (at least for now) completely
optional part, are the new ssync_sender and
ssync_receiver files. Nice and isolated for easier
testing and visibility into test coverage, etc.
All the usual logging, statsd, recon, etc. instrumentation
is still there when using ssync, just as it is when using
rsync.
Beyond the essential error and exceptional condition
logging, I have not added any additional instrumentation at
this time. Unless there is something someone finds super
pressing to have added to the logging, I think such
additions would be better as separate change reviews.
FOR NOW, IT IS NOT RECOMMENDED TO USE SSYNC ON PRODUCTION
CLUSTERS. Some of us will be in a limited fashion to look
for any subtle issues, tuning, etc. but generally ssync is
an experimental feature. In its current implementation it is
probably going to be a bit slower than rsync, but if all
goes according to plan it will end up much faster.
There are no comparisions yet between ssync and rsync other
than some raw virtual machine testing I've done to show it
should compete well enough once we can put it in use in the
real world.
If you Tweet, Google+, or whatever, be sure to indicate it's
experimental. It'd be best to keep it out of deployment
guides, howtos, etc. until we all figure out if we like it,
find it to be stable, etc.
Change-Id: If003dcc6f4109e2d2a42f4873a0779110fff16d6
2013-08-28 16:10:43 +00:00
|
|
|
def test_sync_just_calls_sync_method(self):
|
|
|
|
self.replicator.sync_method = mock.MagicMock()
|
|
|
|
self.replicator.sync('node', 'job', 'suffixes')
|
|
|
|
self.replicator.sync_method.assert_called_once_with(
|
|
|
|
'node', 'job', 'suffixes')
|
|
|
|
|
2018-05-22 10:27:46 -07:00
|
|
|
@mock.patch('swift.obj.replicator.tpool.execute')
|
2012-12-17 06:39:25 -05:00
|
|
|
@mock.patch('swift.obj.replicator.http_connect', autospec=True)
|
2016-11-25 09:26:27 +01:00
|
|
|
@mock.patch('swift.obj.replicator._do_listdir')
|
2018-05-22 10:27:46 -07:00
|
|
|
def test_update(self, mock_do_listdir, mock_http, mock_tpool_execute):
|
2012-12-17 06:39:25 -05:00
|
|
|
|
|
|
|
def set_default(self):
|
|
|
|
self.replicator.suffix_count = 0
|
|
|
|
self.replicator.suffix_sync = 0
|
|
|
|
self.replicator.suffix_hash = 0
|
Multiprocess object replicator
Add a multiprocess mode to the object replicator. Setting the
"replicator_workers" setting to a positive value N will result in the
replicator using up to N worker processes to perform replication
tasks.
At most one worker per disk will be spawned, so one can set
replicator_workers=99999999 to always get one worker per disk
regardless of the number of disks in each node. This is the same
behavior that the object reconstructor has.
Worker process logs will have a bit of information prepended so
operators can tell which messages came from which worker. It looks
like this:
[worker 1/2 pid=16529] 154/154 (100.00%) partitions replicated in 1.02s (150.87/sec, 0s remaining)
The prefix is "[worker M/N pid=P] ", where M is the worker's index, N
is the total number of workers, and P is the process ID. Every message
from the replicator's logger will have the prefix; this includes
messages from down in diskfile, but does not include things printed to
stdout or stderr.
Drive-by fix: don't dump recon stats when replicating only certain
policies. When running the object replicator with replicator_workers >
0 and "--policies=X,Y,Z", the replicator would update recon stats
after running. Since it only ran on a subset of objects, it should not
update recon, much like it doesn't update recon when run with
--devices or --partitions.
Change-Id: I6802a9ad9f1f9b9dafb99d8b095af0fdbf174dc5
2018-03-22 17:08:48 -07:00
|
|
|
self.replicator.last_replication_count = 0
|
|
|
|
self.replicator._zero_stats()
|
2012-12-17 06:39:25 -05:00
|
|
|
self.replicator.partition_times = []
|
|
|
|
|
|
|
|
self.headers = {'Content-Length': '0',
|
Object services user-agent string uses full name
It does not appear that, aside from the user-agent string, the strings
"obj-server", "obj-updater", or "obj-replicator" (or "obj-<anything>"*)
appear in the swift code base, aside from the directory containing the
object services code being named "obj".
Furthermore, the container, account, and proxy services construct their
user-agent string, as reported in the logs, using their full name. In
addition, this full name also shows up as the name of the process via
"ps" or "top", etc., which can make it easier for admins to match log
entries with other tools.
For consistency, we update the object services to use an "object-"
prefix rather than "obj-" in its user agent string.
* obj-etag does appear in a unit test, but not part of the regular
code.
Change-Id: I914fc189514207df2535731eda10cb4b3d30cc6c
2014-06-23 12:59:24 -07:00
|
|
|
'user-agent': 'object-replicator %s' % os.getpid()}
|
2018-05-22 10:27:46 -07:00
|
|
|
mock_tpool_execute.return_value = (0, {})
|
2012-12-17 06:39:25 -05:00
|
|
|
|
|
|
|
all_jobs = self.replicator.collect_jobs()
|
|
|
|
jobs = [job for job in all_jobs if not job['delete']]
|
|
|
|
|
|
|
|
mock_http.return_value = answer = mock.MagicMock()
|
|
|
|
answer.getresponse.return_value = resp = mock.MagicMock()
|
2016-11-25 09:26:27 +01:00
|
|
|
# Check incorrect http_connect with status 507 and
|
2012-12-17 06:39:25 -05:00
|
|
|
# count of attempts and call args
|
|
|
|
resp.status = 507
|
2016-07-18 14:01:57 -07:00
|
|
|
error = '%(replication_ip)s/%(device)s responded as unmounted'
|
2016-11-25 09:26:27 +01:00
|
|
|
expected_listdir_calls = [
|
|
|
|
mock.call(int(job['partition']),
|
|
|
|
self.replicator.replication_cycle)
|
|
|
|
for job in jobs]
|
|
|
|
do_listdir_results = [False, False, True, False, True, False]
|
|
|
|
mock_do_listdir.side_effect = do_listdir_results
|
|
|
|
expected_tpool_calls = [
|
2016-10-30 22:24:18 -07:00
|
|
|
mock.call(self.replicator._df_router[job['policy']]._get_hashes,
|
Modify _get_hashes() arguments to be more generic
Some public functions in the diskfile manager expect or return full
file paths. It implies a filesystem diskfile implementation.
To make it easier to plug alternate diskfile implementations, patch
functions to take more generic arguments.
This commit changes DiskFileManager _get_hashes() arguments from:
- partition_path, recalculate=None, do_listdir=False
to :
- device, partition, policy, recalculate=None, do_listdir=False
Callers are modified accordingly, in diskfile.py, reconstructor.py,
and replicator.py
Change-Id: I8e2d7075572e466ae2fa5ebef5e31d87eed90fec
2017-03-24 15:41:45 +01:00
|
|
|
job['device'], job['partition'], job['policy'],
|
2016-07-25 20:10:44 +05:30
|
|
|
do_listdir=do_listdir)
|
2016-11-25 09:26:27 +01:00
|
|
|
for job, do_listdir in zip(jobs, do_listdir_results)
|
|
|
|
]
|
2012-12-17 06:39:25 -05:00
|
|
|
for job in jobs:
|
|
|
|
set_default(self)
|
2015-03-17 08:32:57 +00:00
|
|
|
ring = job['policy'].object_ring
|
|
|
|
self.headers['X-Backend-Storage-Policy-Index'] = int(job['policy'])
|
2012-12-17 06:39:25 -05:00
|
|
|
self.replicator.update(job)
|
2016-07-18 14:01:57 -07:00
|
|
|
error_lines = self.logger.get_lines_for_level('error')
|
|
|
|
expected = []
|
|
|
|
# ... first the primaries
|
|
|
|
for node in job['nodes']:
|
|
|
|
expected.append(error % node)
|
|
|
|
# ... then it will get handoffs
|
|
|
|
for node in job['policy'].object_ring.get_more_nodes(
|
|
|
|
int(job['partition'])):
|
|
|
|
expected.append(error % node)
|
2017-12-18 19:08:58 +00:00
|
|
|
# ... and finally we get an error about running out of nodes
|
|
|
|
expected.append('Ran out of handoffs while replicating '
|
2018-01-23 18:42:40 +00:00
|
|
|
'partition %s of policy %d' %
|
|
|
|
(job['partition'], job['policy']))
|
2016-07-18 14:01:57 -07:00
|
|
|
self.assertEqual(expected, error_lines)
|
2015-08-06 00:55:36 +05:30
|
|
|
self.assertEqual(len(self.replicator.partition_times), 1)
|
|
|
|
self.assertEqual(mock_http.call_count, len(ring._devs) - 1)
|
2012-12-17 06:39:25 -05:00
|
|
|
reqs = []
|
|
|
|
for node in job['nodes']:
|
2013-08-31 23:42:43 -04:00
|
|
|
reqs.append(mock.call(node['ip'], node['port'], node['device'],
|
|
|
|
job['partition'], 'REPLICATE', '',
|
|
|
|
headers=self.headers))
|
2012-12-17 06:39:25 -05:00
|
|
|
if job['partition'] == '0':
|
2015-08-06 00:55:36 +05:30
|
|
|
self.assertEqual(self.replicator.suffix_hash, 0)
|
2012-12-17 06:39:25 -05:00
|
|
|
mock_http.assert_has_calls(reqs, any_order=True)
|
|
|
|
mock_http.reset_mock()
|
2016-07-18 14:01:57 -07:00
|
|
|
self.logger.clear()
|
2016-11-25 09:26:27 +01:00
|
|
|
mock_do_listdir.assert_has_calls(expected_listdir_calls)
|
2018-05-22 10:27:46 -07:00
|
|
|
mock_tpool_execute.assert_has_calls(expected_tpool_calls)
|
2016-11-25 09:26:27 +01:00
|
|
|
mock_do_listdir.side_effect = None
|
|
|
|
mock_do_listdir.return_value = False
|
|
|
|
# Check incorrect http_connect with status 400 != HTTP_OK
|
2012-12-17 06:39:25 -05:00
|
|
|
resp.status = 400
|
|
|
|
error = 'Invalid response %(resp)s from %(ip)s'
|
|
|
|
for job in jobs:
|
|
|
|
set_default(self)
|
|
|
|
self.replicator.update(job)
|
2016-07-18 14:01:57 -07:00
|
|
|
# ... only the primaries
|
|
|
|
expected = [error % {'resp': 400, 'ip': node['replication_ip']}
|
|
|
|
for node in job['nodes']]
|
|
|
|
self.assertEqual(expected,
|
|
|
|
self.logger.get_lines_for_level('error'))
|
2015-08-06 00:55:36 +05:30
|
|
|
self.assertEqual(len(self.replicator.partition_times), 1)
|
2016-07-18 14:01:57 -07:00
|
|
|
self.logger.clear()
|
2012-12-17 06:39:25 -05:00
|
|
|
|
|
|
|
# Check successful http_connection and exception with
|
2016-11-25 09:26:27 +01:00
|
|
|
# incorrect pickle.loads(resp.read())
|
2012-12-17 06:39:25 -05:00
|
|
|
resp.status = 200
|
2017-08-22 22:40:58 +00:00
|
|
|
resp.read.return_value = 'garbage'
|
2016-07-18 14:01:57 -07:00
|
|
|
expect = 'Error syncing with node: %r: '
|
2012-12-17 06:39:25 -05:00
|
|
|
for job in jobs:
|
|
|
|
set_default(self)
|
|
|
|
self.replicator.update(job)
|
2016-07-18 14:01:57 -07:00
|
|
|
# ... only the primaries
|
|
|
|
expected = [expect % node for node in job['nodes']]
|
|
|
|
error_lines = self.logger.get_lines_for_level('error')
|
|
|
|
self.assertEqual(expected, error_lines)
|
2015-08-06 00:55:36 +05:30
|
|
|
self.assertEqual(len(self.replicator.partition_times), 1)
|
2016-07-18 14:01:57 -07:00
|
|
|
self.logger.clear()
|
2012-12-17 06:39:25 -05:00
|
|
|
|
|
|
|
# Check successful http_connection and correct
|
|
|
|
# pickle.loads(resp.read()) for non local node
|
|
|
|
resp.status = 200
|
|
|
|
local_job = None
|
|
|
|
resp.read.return_value = pickle.dumps({})
|
|
|
|
for job in jobs:
|
|
|
|
set_default(self)
|
2014-09-18 21:16:35 -07:00
|
|
|
# limit local job to policy 0 for simplicity
|
2015-03-17 08:32:57 +00:00
|
|
|
if job['partition'] == '0' and int(job['policy']) == 0:
|
2012-12-17 06:39:25 -05:00
|
|
|
local_job = job.copy()
|
|
|
|
continue
|
|
|
|
self.replicator.update(job)
|
2016-07-18 14:01:57 -07:00
|
|
|
self.assertEqual([], self.logger.get_lines_for_level('error'))
|
2015-08-06 00:55:36 +05:30
|
|
|
self.assertEqual(len(self.replicator.partition_times), 1)
|
|
|
|
self.assertEqual(self.replicator.suffix_hash, 0)
|
|
|
|
self.assertEqual(self.replicator.suffix_sync, 0)
|
|
|
|
self.assertEqual(self.replicator.suffix_count, 0)
|
2016-07-18 14:01:57 -07:00
|
|
|
self.logger.clear()
|
2012-12-17 06:39:25 -05:00
|
|
|
|
Object replication ssync (an rsync alternative)
For this commit, ssync is just a direct replacement for how
we use rsync. Assuming we switch over to ssync completely
someday and drop rsync, we will then be able to improve the
algorithms even further (removing local objects as we
successfully transfer each one rather than waiting for whole
partitions, using an index.db with hash-trees, etc., etc.)
For easier review, this commit can be thought of in distinct
parts:
1) New global_conf_callback functionality for allowing
services to perform setup code before workers, etc. are
launched. (This is then used by ssync in the object
server to create a cross-worker semaphore to restrict
concurrent incoming replication.)
2) A bit of shifting of items up from object server and
replicator to diskfile or DEFAULT conf sections for
better sharing of the same settings. conn_timeout,
node_timeout, client_timeout, network_chunk_size,
disk_chunk_size.
3) Modifications to the object server and replicator to
optionally use ssync in place of rsync. This is done in
a generic enough way that switching to FutureSync should
be easy someday.
4) The biggest part, and (at least for now) completely
optional part, are the new ssync_sender and
ssync_receiver files. Nice and isolated for easier
testing and visibility into test coverage, etc.
All the usual logging, statsd, recon, etc. instrumentation
is still there when using ssync, just as it is when using
rsync.
Beyond the essential error and exceptional condition
logging, I have not added any additional instrumentation at
this time. Unless there is something someone finds super
pressing to have added to the logging, I think such
additions would be better as separate change reviews.
FOR NOW, IT IS NOT RECOMMENDED TO USE SSYNC ON PRODUCTION
CLUSTERS. Some of us will be in a limited fashion to look
for any subtle issues, tuning, etc. but generally ssync is
an experimental feature. In its current implementation it is
probably going to be a bit slower than rsync, but if all
goes according to plan it will end up much faster.
There are no comparisions yet between ssync and rsync other
than some raw virtual machine testing I've done to show it
should compete well enough once we can put it in use in the
real world.
If you Tweet, Google+, or whatever, be sure to indicate it's
experimental. It'd be best to keep it out of deployment
guides, howtos, etc. until we all figure out if we like it,
find it to be stable, etc.
Change-Id: If003dcc6f4109e2d2a42f4873a0779110fff16d6
2013-08-28 16:10:43 +00:00
|
|
|
# Check successful http_connect and sync for local node
|
2018-05-22 10:27:46 -07:00
|
|
|
mock_tpool_execute.return_value = (1, {'a83': 'ba47fd314242ec8c'
|
2012-12-17 06:39:25 -05:00
|
|
|
'7efb91f5d57336e4'})
|
|
|
|
resp.read.return_value = pickle.dumps({'a83': 'c130a2c17ed45102a'
|
|
|
|
'ada0f4eee69494ff'})
|
|
|
|
set_default(self)
|
2014-05-29 00:54:07 -07:00
|
|
|
self.replicator.sync = fake_func = \
|
|
|
|
mock.MagicMock(return_value=(True, []))
|
2012-12-17 06:39:25 -05:00
|
|
|
self.replicator.update(local_job)
|
|
|
|
reqs = []
|
|
|
|
for node in local_job['nodes']:
|
2013-07-22 15:27:54 -07:00
|
|
|
reqs.append(mock.call(node, local_job, ['a83']))
|
2012-12-17 06:39:25 -05:00
|
|
|
fake_func.assert_has_calls(reqs, any_order=True)
|
2015-08-06 00:55:36 +05:30
|
|
|
self.assertEqual(fake_func.call_count, 2)
|
Multiprocess object replicator
Add a multiprocess mode to the object replicator. Setting the
"replicator_workers" setting to a positive value N will result in the
replicator using up to N worker processes to perform replication
tasks.
At most one worker per disk will be spawned, so one can set
replicator_workers=99999999 to always get one worker per disk
regardless of the number of disks in each node. This is the same
behavior that the object reconstructor has.
Worker process logs will have a bit of information prepended so
operators can tell which messages came from which worker. It looks
like this:
[worker 1/2 pid=16529] 154/154 (100.00%) partitions replicated in 1.02s (150.87/sec, 0s remaining)
The prefix is "[worker M/N pid=P] ", where M is the worker's index, N
is the total number of workers, and P is the process ID. Every message
from the replicator's logger will have the prefix; this includes
messages from down in diskfile, but does not include things printed to
stdout or stderr.
Drive-by fix: don't dump recon stats when replicating only certain
policies. When running the object replicator with replicator_workers >
0 and "--policies=X,Y,Z", the replicator would update recon stats
after running. Since it only ran on a subset of objects, it should not
update recon, much like it doesn't update recon when run with
--devices or --partitions.
Change-Id: I6802a9ad9f1f9b9dafb99d8b095af0fdbf174dc5
2018-03-22 17:08:48 -07:00
|
|
|
stats = self.replicator.total_stats
|
|
|
|
self.assertEqual(stats.attempted, 1)
|
|
|
|
self.assertEqual(stats.suffix_sync, 2)
|
|
|
|
self.assertEqual(stats.suffix_hash, 1)
|
|
|
|
self.assertEqual(stats.suffix_count, 1)
|
2014-05-29 00:54:07 -07:00
|
|
|
|
|
|
|
# Efficient Replication Case
|
|
|
|
set_default(self)
|
|
|
|
self.replicator.sync = fake_func = \
|
|
|
|
mock.MagicMock(return_value=(True, []))
|
|
|
|
all_jobs = self.replicator.collect_jobs()
|
|
|
|
job = None
|
|
|
|
for tmp in all_jobs:
|
|
|
|
if tmp['partition'] == '3':
|
|
|
|
job = tmp
|
|
|
|
break
|
|
|
|
# The candidate nodes to replicate (i.e. dev1 and dev3)
|
|
|
|
# belong to another region
|
|
|
|
self.replicator.update(job)
|
2015-08-06 00:55:36 +05:30
|
|
|
self.assertEqual(fake_func.call_count, 1)
|
Multiprocess object replicator
Add a multiprocess mode to the object replicator. Setting the
"replicator_workers" setting to a positive value N will result in the
replicator using up to N worker processes to perform replication
tasks.
At most one worker per disk will be spawned, so one can set
replicator_workers=99999999 to always get one worker per disk
regardless of the number of disks in each node. This is the same
behavior that the object reconstructor has.
Worker process logs will have a bit of information prepended so
operators can tell which messages came from which worker. It looks
like this:
[worker 1/2 pid=16529] 154/154 (100.00%) partitions replicated in 1.02s (150.87/sec, 0s remaining)
The prefix is "[worker M/N pid=P] ", where M is the worker's index, N
is the total number of workers, and P is the process ID. Every message
from the replicator's logger will have the prefix; this includes
messages from down in diskfile, but does not include things printed to
stdout or stderr.
Drive-by fix: don't dump recon stats when replicating only certain
policies. When running the object replicator with replicator_workers >
0 and "--policies=X,Y,Z", the replicator would update recon stats
after running. Since it only ran on a subset of objects, it should not
update recon, much like it doesn't update recon when run with
--devices or --partitions.
Change-Id: I6802a9ad9f1f9b9dafb99d8b095af0fdbf174dc5
2018-03-22 17:08:48 -07:00
|
|
|
stats = self.replicator.total_stats
|
|
|
|
self.assertEqual(stats.attempted, 1)
|
|
|
|
self.assertEqual(stats.suffix_sync, 1)
|
|
|
|
self.assertEqual(stats.suffix_hash, 1)
|
|
|
|
self.assertEqual(stats.suffix_count, 1)
|
2014-05-29 00:54:07 -07:00
|
|
|
|
2012-12-17 06:39:25 -05:00
|
|
|
mock_http.reset_mock()
|
2016-07-18 14:01:57 -07:00
|
|
|
self.logger.clear()
|
2012-12-17 06:39:25 -05:00
|
|
|
|
2014-03-18 10:50:17 -07:00
|
|
|
# test for replication params on policy 0 only
|
2012-12-17 06:39:25 -05:00
|
|
|
repl_job = local_job.copy()
|
|
|
|
for node in repl_job['nodes']:
|
2013-07-22 15:27:54 -07:00
|
|
|
node['replication_ip'] = '127.0.0.11'
|
|
|
|
node['replication_port'] = '6011'
|
2012-12-17 06:39:25 -05:00
|
|
|
set_default(self)
|
2014-09-18 21:16:35 -07:00
|
|
|
# with only one set of headers make sure we specify index 0 here
|
2014-03-18 10:50:17 -07:00
|
|
|
# as otherwise it may be different from earlier tests
|
2014-06-23 12:52:50 -07:00
|
|
|
self.headers['X-Backend-Storage-Policy-Index'] = 0
|
2012-12-17 06:39:25 -05:00
|
|
|
self.replicator.update(repl_job)
|
|
|
|
reqs = []
|
|
|
|
for node in repl_job['nodes']:
|
2013-08-31 23:42:43 -04:00
|
|
|
reqs.append(mock.call(node['replication_ip'],
|
|
|
|
node['replication_port'], node['device'],
|
|
|
|
repl_job['partition'], 'REPLICATE',
|
|
|
|
'', headers=self.headers))
|
|
|
|
reqs.append(mock.call(node['replication_ip'],
|
|
|
|
node['replication_port'], node['device'],
|
|
|
|
repl_job['partition'], 'REPLICATE',
|
|
|
|
'/a83', headers=self.headers))
|
2012-12-17 06:39:25 -05:00
|
|
|
mock_http.assert_has_calls(reqs, any_order=True)
|
|
|
|
|
2015-01-20 12:14:32 +05:30
|
|
|
def test_rsync_compress_different_region(self):
|
|
|
|
self.assertEqual(self.replicator.sync_method, self.replicator.rsync)
|
|
|
|
jobs = self.replicator.collect_jobs()
|
|
|
|
_m_rsync = mock.Mock(return_value=0)
|
|
|
|
_m_os_path_exists = mock.Mock(return_value=True)
|
2016-07-18 14:01:57 -07:00
|
|
|
with mock.patch.object(self.replicator, '_rsync', _m_rsync), \
|
|
|
|
mock.patch('os.path.exists', _m_os_path_exists):
|
|
|
|
for job in jobs:
|
|
|
|
self.assertTrue('region' in job)
|
|
|
|
for node in job['nodes']:
|
|
|
|
for rsync_compress in (True, False):
|
|
|
|
self.replicator.rsync_compress = rsync_compress
|
|
|
|
ret = self.replicator.sync(node, job,
|
|
|
|
['fake_suffix'])
|
|
|
|
self.assertTrue(ret)
|
|
|
|
if node['region'] != job['region']:
|
|
|
|
if rsync_compress:
|
|
|
|
# --compress arg should be passed to rsync
|
|
|
|
# binary only when rsync_compress option is
|
|
|
|
# enabled AND destination node is in a
|
|
|
|
# different region
|
|
|
|
self.assertTrue('--compress' in
|
|
|
|
_m_rsync.call_args[0][0])
|
2015-01-20 12:14:32 +05:30
|
|
|
else:
|
|
|
|
self.assertFalse('--compress' in
|
|
|
|
_m_rsync.call_args[0][0])
|
2016-07-18 14:01:57 -07:00
|
|
|
else:
|
|
|
|
self.assertFalse('--compress' in
|
|
|
|
_m_rsync.call_args[0][0])
|
|
|
|
self.assertEqual(
|
|
|
|
_m_os_path_exists.call_args_list[-1][0][0],
|
|
|
|
os.path.join(job['path'], 'fake_suffix'))
|
|
|
|
self.assertEqual(
|
|
|
|
_m_os_path_exists.call_args_list[-2][0][0],
|
|
|
|
os.path.join(job['path']))
|
2015-01-20 12:14:32 +05:30
|
|
|
|
2016-11-25 09:26:27 +01:00
|
|
|
def test_do_listdir(self):
|
|
|
|
# Test if do_listdir is enabled for every 10th partition to rehash
|
|
|
|
# First number is the number of partitions in the job, list entries
|
|
|
|
# are the expected partition numbers per run
|
|
|
|
test_data = {
|
|
|
|
9: [1, 0, 1, 1, 1, 1, 1, 1, 1, 1],
|
|
|
|
29: [3, 2, 3, 3, 3, 3, 3, 3, 3, 3],
|
|
|
|
111: [12, 11, 11, 11, 11, 11, 11, 11, 11, 11]}
|
|
|
|
|
|
|
|
for partitions, expected in test_data.items():
|
|
|
|
seen = []
|
|
|
|
for phase in range(10):
|
|
|
|
invalidated = 0
|
|
|
|
for partition in range(partitions):
|
|
|
|
if object_replicator._do_listdir(partition, phase):
|
|
|
|
seen.append(partition)
|
|
|
|
invalidated += 1
|
|
|
|
# Every 10th partition is seen after each phase
|
|
|
|
self.assertEqual(expected[phase], invalidated)
|
|
|
|
|
|
|
|
# After 10 cycles every partition is seen exactly once
|
|
|
|
self.assertEqual(sorted(range(partitions)), sorted(seen))
|
|
|
|
|
2016-07-04 18:21:54 +02:00
|
|
|
def test_replicate_skipped_partpower_increase(self):
|
|
|
|
_create_test_rings(self.testdir, next_part_power=4)
|
|
|
|
self.replicator.replicate()
|
|
|
|
self.assertEqual(0, self.replicator.job_count)
|
Multiprocess object replicator
Add a multiprocess mode to the object replicator. Setting the
"replicator_workers" setting to a positive value N will result in the
replicator using up to N worker processes to perform replication
tasks.
At most one worker per disk will be spawned, so one can set
replicator_workers=99999999 to always get one worker per disk
regardless of the number of disks in each node. This is the same
behavior that the object reconstructor has.
Worker process logs will have a bit of information prepended so
operators can tell which messages came from which worker. It looks
like this:
[worker 1/2 pid=16529] 154/154 (100.00%) partitions replicated in 1.02s (150.87/sec, 0s remaining)
The prefix is "[worker M/N pid=P] ", where M is the worker's index, N
is the total number of workers, and P is the process ID. Every message
from the replicator's logger will have the prefix; this includes
messages from down in diskfile, but does not include things printed to
stdout or stderr.
Drive-by fix: don't dump recon stats when replicating only certain
policies. When running the object replicator with replicator_workers >
0 and "--policies=X,Y,Z", the replicator would update recon stats
after running. Since it only ran on a subset of objects, it should not
update recon, much like it doesn't update recon when run with
--devices or --partitions.
Change-Id: I6802a9ad9f1f9b9dafb99d8b095af0fdbf174dc5
2018-03-22 17:08:48 -07:00
|
|
|
self.assertEqual(0, self.replicator.total_stats.attempted)
|
2016-07-04 18:21:54 +02:00
|
|
|
warnings = self.logger.get_lines_for_level('warning')
|
|
|
|
self.assertIn(
|
|
|
|
"next_part_power set in policy 'one'. Skipping", warnings)
|
|
|
|
|
Remove object replicator's lockup detector/mitigator.
Sometimes, an rsync process just won't die. You can send SIGKILL, but
it isn't very effective. This is sometimes seen due to attempted I/O
on a failing disk; with some disks, an rsync process won't die until
Linux finishes the current I/O operation (whether success or failure),
but the disk can't succeed and will retry forever instead of
failing. The net effect is an unkillable rsync process.
The replicator was dealing with this by sending SIGKILL to any rsync
that ran too long, then calling waitpid() in a loop[1] until the rsync
died so it could reap the child process. This worked pretty well
unless it met an unkillable rsync; in that case, one greenthread would
end up blocked for a very long time. Since the replicator's main loop
works by (a) gathering all replication jobs, (b) performing them in
parallel with some limited concurrency, then (c) waiting for all jobs
to complete, an unkillable rsync would block the entire replicator.
There was an attempt to address this by adding a lockup detector: if
the replicator failed to complete any replication cycle in N seconds
[2], all greenthreads except the main one would be terminated and the
replication cycle restarted. It works okay, but only handles total
failure. If you have 20 greenthreads working and 19 of them are
blocked on unkillable rsyncs, then as long as the 20th greenthread
manages to replicate at least one partition every N seconds, the
replicator will just keep limping along.
This commit removes the lockup detector. Instead, when a replicator
greenthread happens upon an rsync that doesn't die promptly after
receiving SIGKILL, the process handle is sent to a background
greenthread; that background greenthread simply waits for those rsync
processes to finally die and reaps them. This lets the replicator make
better progress in the presence of unkillable rsyncs.
[1] It's a call to subprocess.Popen.wait(); the looping and sleeping
happens in eventlet.
[2] The default is 1800 seconds = 30 minutes, but the value is
configurable.
Change-Id: If6dc7b003e18ab4e8a5ed687c965025ebd417dfa
2018-03-12 17:58:23 -07:00
|
|
|
def test_replicate_rsync_timeout(self):
|
2018-02-26 21:23:55 +09:00
|
|
|
cur_part = '0'
|
|
|
|
df = self.df_mgr.get_diskfile('sda', cur_part, 'a', 'c', 'o',
|
|
|
|
policy=POLICIES[0])
|
|
|
|
mkdirs(df._datadir)
|
|
|
|
f = open(os.path.join(df._datadir,
|
|
|
|
normalize_timestamp(time.time()) + '.data'),
|
|
|
|
'wb')
|
2019-01-29 03:28:26 +01:00
|
|
|
f.write(b'1234567890')
|
2018-02-26 21:23:55 +09:00
|
|
|
f.close()
|
|
|
|
|
2018-03-03 17:07:54 +00:00
|
|
|
mock_procs = []
|
2018-02-26 21:23:55 +09:00
|
|
|
|
2018-03-03 17:07:54 +00:00
|
|
|
def new_mock(*a, **kw):
|
|
|
|
proc = MockHungProcess()
|
|
|
|
mock_procs.append(proc)
|
|
|
|
return proc
|
|
|
|
|
|
|
|
with mock.patch('swift.obj.replicator.http_connect',
|
|
|
|
mock_http_connect(200)), \
|
Remove object replicator's lockup detector/mitigator.
Sometimes, an rsync process just won't die. You can send SIGKILL, but
it isn't very effective. This is sometimes seen due to attempted I/O
on a failing disk; with some disks, an rsync process won't die until
Linux finishes the current I/O operation (whether success or failure),
but the disk can't succeed and will retry forever instead of
failing. The net effect is an unkillable rsync process.
The replicator was dealing with this by sending SIGKILL to any rsync
that ran too long, then calling waitpid() in a loop[1] until the rsync
died so it could reap the child process. This worked pretty well
unless it met an unkillable rsync; in that case, one greenthread would
end up blocked for a very long time. Since the replicator's main loop
works by (a) gathering all replication jobs, (b) performing them in
parallel with some limited concurrency, then (c) waiting for all jobs
to complete, an unkillable rsync would block the entire replicator.
There was an attempt to address this by adding a lockup detector: if
the replicator failed to complete any replication cycle in N seconds
[2], all greenthreads except the main one would be terminated and the
replication cycle restarted. It works okay, but only handles total
failure. If you have 20 greenthreads working and 19 of them are
blocked on unkillable rsyncs, then as long as the 20th greenthread
manages to replicate at least one partition every N seconds, the
replicator will just keep limping along.
This commit removes the lockup detector. Instead, when a replicator
greenthread happens upon an rsync that doesn't die promptly after
receiving SIGKILL, the process handle is sent to a background
greenthread; that background greenthread simply waits for those rsync
processes to finally die and reaps them. This lets the replicator make
better progress in the presence of unkillable rsyncs.
[1] It's a call to subprocess.Popen.wait(); the looping and sleeping
happens in eventlet.
[2] The default is 1800 seconds = 30 minutes, but the value is
configurable.
Change-Id: If6dc7b003e18ab4e8a5ed687c965025ebd417dfa
2018-03-12 17:58:23 -07:00
|
|
|
mock.patch.object(self.replicator, 'rsync_timeout', 0.01), \
|
2018-03-03 17:07:54 +00:00
|
|
|
mock.patch('eventlet.green.subprocess.Popen', new_mock):
|
2018-05-22 12:57:16 +01:00
|
|
|
self.replicator.rsync_error_log_line_length = 20
|
Remove object replicator's lockup detector/mitigator.
Sometimes, an rsync process just won't die. You can send SIGKILL, but
it isn't very effective. This is sometimes seen due to attempted I/O
on a failing disk; with some disks, an rsync process won't die until
Linux finishes the current I/O operation (whether success or failure),
but the disk can't succeed and will retry forever instead of
failing. The net effect is an unkillable rsync process.
The replicator was dealing with this by sending SIGKILL to any rsync
that ran too long, then calling waitpid() in a loop[1] until the rsync
died so it could reap the child process. This worked pretty well
unless it met an unkillable rsync; in that case, one greenthread would
end up blocked for a very long time. Since the replicator's main loop
works by (a) gathering all replication jobs, (b) performing them in
parallel with some limited concurrency, then (c) waiting for all jobs
to complete, an unkillable rsync would block the entire replicator.
There was an attempt to address this by adding a lockup detector: if
the replicator failed to complete any replication cycle in N seconds
[2], all greenthreads except the main one would be terminated and the
replication cycle restarted. It works okay, but only handles total
failure. If you have 20 greenthreads working and 19 of them are
blocked on unkillable rsyncs, then as long as the 20th greenthread
manages to replicate at least one partition every N seconds, the
replicator will just keep limping along.
This commit removes the lockup detector. Instead, when a replicator
greenthread happens upon an rsync that doesn't die promptly after
receiving SIGKILL, the process handle is sent to a background
greenthread; that background greenthread simply waits for those rsync
processes to finally die and reaps them. This lets the replicator make
better progress in the presence of unkillable rsyncs.
[1] It's a call to subprocess.Popen.wait(); the looping and sleeping
happens in eventlet.
[2] The default is 1800 seconds = 30 minutes, but the value is
configurable.
Change-Id: If6dc7b003e18ab4e8a5ed687c965025ebd417dfa
2018-03-12 17:58:23 -07:00
|
|
|
self.replicator.run_once()
|
2018-03-03 17:07:54 +00:00
|
|
|
for proc in mock_procs:
|
|
|
|
self.assertEqual(proc._calls, [
|
|
|
|
('wait', 'running'),
|
Remove object replicator's lockup detector/mitigator.
Sometimes, an rsync process just won't die. You can send SIGKILL, but
it isn't very effective. This is sometimes seen due to attempted I/O
on a failing disk; with some disks, an rsync process won't die until
Linux finishes the current I/O operation (whether success or failure),
but the disk can't succeed and will retry forever instead of
failing. The net effect is an unkillable rsync process.
The replicator was dealing with this by sending SIGKILL to any rsync
that ran too long, then calling waitpid() in a loop[1] until the rsync
died so it could reap the child process. This worked pretty well
unless it met an unkillable rsync; in that case, one greenthread would
end up blocked for a very long time. Since the replicator's main loop
works by (a) gathering all replication jobs, (b) performing them in
parallel with some limited concurrency, then (c) waiting for all jobs
to complete, an unkillable rsync would block the entire replicator.
There was an attempt to address this by adding a lockup detector: if
the replicator failed to complete any replication cycle in N seconds
[2], all greenthreads except the main one would be terminated and the
replication cycle restarted. It works okay, but only handles total
failure. If you have 20 greenthreads working and 19 of them are
blocked on unkillable rsyncs, then as long as the 20th greenthread
manages to replicate at least one partition every N seconds, the
replicator will just keep limping along.
This commit removes the lockup detector. Instead, when a replicator
greenthread happens upon an rsync that doesn't die promptly after
receiving SIGKILL, the process handle is sent to a background
greenthread; that background greenthread simply waits for those rsync
processes to finally die and reaps them. This lets the replicator make
better progress in the presence of unkillable rsyncs.
[1] It's a call to subprocess.Popen.wait(); the looping and sleeping
happens in eventlet.
[2] The default is 1800 seconds = 30 minutes, but the value is
configurable.
Change-Id: If6dc7b003e18ab4e8a5ed687c965025ebd417dfa
2018-03-12 17:58:23 -07:00
|
|
|
('kill', 'running'),
|
|
|
|
('wait', 'killed'),
|
2018-03-03 17:07:54 +00:00
|
|
|
])
|
Remove object replicator's lockup detector/mitigator.
Sometimes, an rsync process just won't die. You can send SIGKILL, but
it isn't very effective. This is sometimes seen due to attempted I/O
on a failing disk; with some disks, an rsync process won't die until
Linux finishes the current I/O operation (whether success or failure),
but the disk can't succeed and will retry forever instead of
failing. The net effect is an unkillable rsync process.
The replicator was dealing with this by sending SIGKILL to any rsync
that ran too long, then calling waitpid() in a loop[1] until the rsync
died so it could reap the child process. This worked pretty well
unless it met an unkillable rsync; in that case, one greenthread would
end up blocked for a very long time. Since the replicator's main loop
works by (a) gathering all replication jobs, (b) performing them in
parallel with some limited concurrency, then (c) waiting for all jobs
to complete, an unkillable rsync would block the entire replicator.
There was an attempt to address this by adding a lockup detector: if
the replicator failed to complete any replication cycle in N seconds
[2], all greenthreads except the main one would be terminated and the
replication cycle restarted. It works okay, but only handles total
failure. If you have 20 greenthreads working and 19 of them are
blocked on unkillable rsyncs, then as long as the 20th greenthread
manages to replicate at least one partition every N seconds, the
replicator will just keep limping along.
This commit removes the lockup detector. Instead, when a replicator
greenthread happens upon an rsync that doesn't die promptly after
receiving SIGKILL, the process handle is sent to a background
greenthread; that background greenthread simply waits for those rsync
processes to finally die and reaps them. This lets the replicator make
better progress in the presence of unkillable rsyncs.
[1] It's a call to subprocess.Popen.wait(); the looping and sleeping
happens in eventlet.
[2] The default is 1800 seconds = 30 minutes, but the value is
configurable.
Change-Id: If6dc7b003e18ab4e8a5ed687c965025ebd417dfa
2018-03-12 17:58:23 -07:00
|
|
|
self.assertEqual(len(mock_procs), 2)
|
2018-05-22 12:57:16 +01:00
|
|
|
error_lines = self.replicator.logger.get_lines_for_level('error')
|
|
|
|
# verify logs are truncated to rsync_error_log_line_length
|
|
|
|
self.assertEqual('Killing long-running', error_lines[0])
|
|
|
|
self.assertEqual('Killing long-running', error_lines[1])
|
2018-02-26 21:23:55 +09:00
|
|
|
|
Remove object replicator's lockup detector/mitigator.
Sometimes, an rsync process just won't die. You can send SIGKILL, but
it isn't very effective. This is sometimes seen due to attempted I/O
on a failing disk; with some disks, an rsync process won't die until
Linux finishes the current I/O operation (whether success or failure),
but the disk can't succeed and will retry forever instead of
failing. The net effect is an unkillable rsync process.
The replicator was dealing with this by sending SIGKILL to any rsync
that ran too long, then calling waitpid() in a loop[1] until the rsync
died so it could reap the child process. This worked pretty well
unless it met an unkillable rsync; in that case, one greenthread would
end up blocked for a very long time. Since the replicator's main loop
works by (a) gathering all replication jobs, (b) performing them in
parallel with some limited concurrency, then (c) waiting for all jobs
to complete, an unkillable rsync would block the entire replicator.
There was an attempt to address this by adding a lockup detector: if
the replicator failed to complete any replication cycle in N seconds
[2], all greenthreads except the main one would be terminated and the
replication cycle restarted. It works okay, but only handles total
failure. If you have 20 greenthreads working and 19 of them are
blocked on unkillable rsyncs, then as long as the 20th greenthread
manages to replicate at least one partition every N seconds, the
replicator will just keep limping along.
This commit removes the lockup detector. Instead, when a replicator
greenthread happens upon an rsync that doesn't die promptly after
receiving SIGKILL, the process handle is sent to a background
greenthread; that background greenthread simply waits for those rsync
processes to finally die and reaps them. This lets the replicator make
better progress in the presence of unkillable rsyncs.
[1] It's a call to subprocess.Popen.wait(); the looping and sleeping
happens in eventlet.
[2] The default is 1800 seconds = 30 minutes, but the value is
configurable.
Change-Id: If6dc7b003e18ab4e8a5ed687c965025ebd417dfa
2018-03-12 17:58:23 -07:00
|
|
|
def test_replicate_rsync_timeout_wedged(self):
|
2018-03-03 17:07:54 +00:00
|
|
|
cur_part = '0'
|
|
|
|
df = self.df_mgr.get_diskfile('sda', cur_part, 'a', 'c', 'o',
|
|
|
|
policy=POLICIES[0])
|
|
|
|
mkdirs(df._datadir)
|
|
|
|
f = open(os.path.join(df._datadir,
|
|
|
|
normalize_timestamp(time.time()) + '.data'),
|
|
|
|
'wb')
|
2019-01-29 03:28:26 +01:00
|
|
|
f.write(b'1234567890')
|
2018-03-03 17:07:54 +00:00
|
|
|
f.close()
|
|
|
|
|
|
|
|
mock_procs = []
|
2018-02-26 21:23:55 +09:00
|
|
|
|
2018-03-03 17:07:54 +00:00
|
|
|
def new_mock(*a, **kw):
|
2018-04-03 16:40:06 -07:00
|
|
|
proc = MockHungProcess(polls_needed=2)
|
2018-03-03 17:07:54 +00:00
|
|
|
mock_procs.append(proc)
|
|
|
|
return proc
|
2018-02-26 21:23:55 +09:00
|
|
|
|
|
|
|
with mock.patch('swift.obj.replicator.http_connect',
|
|
|
|
mock_http_connect(200)), \
|
2018-03-03 17:07:54 +00:00
|
|
|
mock.patch.object(self.replicator, 'rsync_timeout', 0.01), \
|
|
|
|
mock.patch('eventlet.green.subprocess.Popen', new_mock):
|
Remove object replicator's lockup detector/mitigator.
Sometimes, an rsync process just won't die. You can send SIGKILL, but
it isn't very effective. This is sometimes seen due to attempted I/O
on a failing disk; with some disks, an rsync process won't die until
Linux finishes the current I/O operation (whether success or failure),
but the disk can't succeed and will retry forever instead of
failing. The net effect is an unkillable rsync process.
The replicator was dealing with this by sending SIGKILL to any rsync
that ran too long, then calling waitpid() in a loop[1] until the rsync
died so it could reap the child process. This worked pretty well
unless it met an unkillable rsync; in that case, one greenthread would
end up blocked for a very long time. Since the replicator's main loop
works by (a) gathering all replication jobs, (b) performing them in
parallel with some limited concurrency, then (c) waiting for all jobs
to complete, an unkillable rsync would block the entire replicator.
There was an attempt to address this by adding a lockup detector: if
the replicator failed to complete any replication cycle in N seconds
[2], all greenthreads except the main one would be terminated and the
replication cycle restarted. It works okay, but only handles total
failure. If you have 20 greenthreads working and 19 of them are
blocked on unkillable rsyncs, then as long as the 20th greenthread
manages to replicate at least one partition every N seconds, the
replicator will just keep limping along.
This commit removes the lockup detector. Instead, when a replicator
greenthread happens upon an rsync that doesn't die promptly after
receiving SIGKILL, the process handle is sent to a background
greenthread; that background greenthread simply waits for those rsync
processes to finally die and reaps them. This lets the replicator make
better progress in the presence of unkillable rsyncs.
[1] It's a call to subprocess.Popen.wait(); the looping and sleeping
happens in eventlet.
[2] The default is 1800 seconds = 30 minutes, but the value is
configurable.
Change-Id: If6dc7b003e18ab4e8a5ed687c965025ebd417dfa
2018-03-12 17:58:23 -07:00
|
|
|
self.replicator.run_once()
|
2018-03-03 17:07:54 +00:00
|
|
|
for proc in mock_procs:
|
|
|
|
self.assertEqual(proc._calls, [
|
|
|
|
('wait', 'running'),
|
|
|
|
('kill', 'running'),
|
|
|
|
('wait', 'killed'),
|
2018-04-03 16:40:06 -07:00
|
|
|
('poll', 'killed'),
|
|
|
|
('poll', 'killed'),
|
2018-03-03 17:07:54 +00:00
|
|
|
])
|
|
|
|
self.assertEqual(len(mock_procs), 2)
|
2012-12-17 06:39:25 -05:00
|
|
|
|
2018-04-13 15:08:39 +08:00
|
|
|
def test_limit_rsync_log(self):
|
|
|
|
def do_test(length_limit, log_line, expected):
|
|
|
|
self.replicator.rsync_error_log_line_length = length_limit
|
|
|
|
result = self.replicator._limit_rsync_log(log_line)
|
|
|
|
self.assertEqual(result, expected)
|
|
|
|
|
|
|
|
tests = [{'length_limit': 20,
|
|
|
|
'log_line': 'a' * 20,
|
|
|
|
'expected': 'a' * 20},
|
|
|
|
{'length_limit': 20,
|
|
|
|
'log_line': 'a' * 19,
|
|
|
|
'expected': 'a' * 19},
|
|
|
|
{'length_limit': 20,
|
|
|
|
'log_line': 'a' * 21,
|
|
|
|
'expected': 'a' * 20},
|
|
|
|
{'length_limit': None,
|
|
|
|
'log_line': 'a' * 50,
|
|
|
|
'expected': 'a' * 50},
|
|
|
|
{'length_limit': 0,
|
|
|
|
'log_line': 'a' * 50,
|
|
|
|
'expected': 'a' * 50}]
|
|
|
|
|
|
|
|
for params in tests:
|
|
|
|
do_test(**params)
|
|
|
|
|
Remove object replicator's lockup detector/mitigator.
Sometimes, an rsync process just won't die. You can send SIGKILL, but
it isn't very effective. This is sometimes seen due to attempted I/O
on a failing disk; with some disks, an rsync process won't die until
Linux finishes the current I/O operation (whether success or failure),
but the disk can't succeed and will retry forever instead of
failing. The net effect is an unkillable rsync process.
The replicator was dealing with this by sending SIGKILL to any rsync
that ran too long, then calling waitpid() in a loop[1] until the rsync
died so it could reap the child process. This worked pretty well
unless it met an unkillable rsync; in that case, one greenthread would
end up blocked for a very long time. Since the replicator's main loop
works by (a) gathering all replication jobs, (b) performing them in
parallel with some limited concurrency, then (c) waiting for all jobs
to complete, an unkillable rsync would block the entire replicator.
There was an attempt to address this by adding a lockup detector: if
the replicator failed to complete any replication cycle in N seconds
[2], all greenthreads except the main one would be terminated and the
replication cycle restarted. It works okay, but only handles total
failure. If you have 20 greenthreads working and 19 of them are
blocked on unkillable rsyncs, then as long as the 20th greenthread
manages to replicate at least one partition every N seconds, the
replicator will just keep limping along.
This commit removes the lockup detector. Instead, when a replicator
greenthread happens upon an rsync that doesn't die promptly after
receiving SIGKILL, the process handle is sent to a background
greenthread; that background greenthread simply waits for those rsync
processes to finally die and reaps them. This lets the replicator make
better progress in the presence of unkillable rsyncs.
[1] It's a call to subprocess.Popen.wait(); the looping and sleeping
happens in eventlet.
[2] The default is 1800 seconds = 30 minutes, but the value is
configurable.
Change-Id: If6dc7b003e18ab4e8a5ed687c965025ebd417dfa
2018-03-12 17:58:23 -07:00
|
|
|
|
Multiprocess object replicator
Add a multiprocess mode to the object replicator. Setting the
"replicator_workers" setting to a positive value N will result in the
replicator using up to N worker processes to perform replication
tasks.
At most one worker per disk will be spawned, so one can set
replicator_workers=99999999 to always get one worker per disk
regardless of the number of disks in each node. This is the same
behavior that the object reconstructor has.
Worker process logs will have a bit of information prepended so
operators can tell which messages came from which worker. It looks
like this:
[worker 1/2 pid=16529] 154/154 (100.00%) partitions replicated in 1.02s (150.87/sec, 0s remaining)
The prefix is "[worker M/N pid=P] ", where M is the worker's index, N
is the total number of workers, and P is the process ID. Every message
from the replicator's logger will have the prefix; this includes
messages from down in diskfile, but does not include things printed to
stdout or stderr.
Drive-by fix: don't dump recon stats when replicating only certain
policies. When running the object replicator with replicator_workers >
0 and "--policies=X,Y,Z", the replicator would update recon stats
after running. Since it only ran on a subset of objects, it should not
update recon, much like it doesn't update recon when run with
--devices or --partitions.
Change-Id: I6802a9ad9f1f9b9dafb99d8b095af0fdbf174dc5
2018-03-22 17:08:48 -07:00
|
|
|
@patch_policies([StoragePolicy(0, 'zero', False),
|
|
|
|
StoragePolicy(1, 'one', True)])
|
|
|
|
class TestMultiProcessReplicator(unittest.TestCase):
|
|
|
|
def setUp(self):
|
|
|
|
# recon cache path
|
|
|
|
self.recon_cache = tempfile.mkdtemp()
|
|
|
|
rmtree(self.recon_cache, ignore_errors=1)
|
|
|
|
os.mkdir(self.recon_cache)
|
|
|
|
self.recon_file = os.path.join(self.recon_cache, 'object.recon')
|
|
|
|
|
|
|
|
bind_port = 6200
|
|
|
|
|
|
|
|
# Set up some rings
|
|
|
|
self.testdir = tempfile.mkdtemp()
|
|
|
|
_create_test_rings(self.testdir, devs=[
|
|
|
|
{'id': 0, 'device': 'sda', 'zone': 0,
|
|
|
|
'region': 1, 'ip': '127.0.0.1', 'port': bind_port},
|
|
|
|
{'id': 1, 'device': 'sdb', 'zone': 0,
|
|
|
|
'region': 1, 'ip': '127.0.0.1', 'port': bind_port},
|
|
|
|
{'id': 2, 'device': 'sdc', 'zone': 0,
|
|
|
|
'region': 1, 'ip': '127.0.0.1', 'port': bind_port},
|
|
|
|
{'id': 3, 'device': 'sdd', 'zone': 0,
|
|
|
|
'region': 1, 'ip': '127.0.0.1', 'port': bind_port},
|
|
|
|
{'id': 4, 'device': 'sde', 'zone': 0,
|
|
|
|
'region': 1, 'ip': '127.0.0.1', 'port': bind_port},
|
|
|
|
{'id': 100, 'device': 'notme0', 'zone': 0,
|
|
|
|
'region': 1, 'ip': '127.99.99.99', 'port': bind_port}])
|
|
|
|
|
|
|
|
self.logger = debug_logger('test-replicator')
|
|
|
|
self.conf = dict(
|
|
|
|
bind_ip='127.0.0.1', bind_port=bind_port,
|
|
|
|
swift_dir=self.testdir,
|
|
|
|
mount_check='false', recon_cache_path=self.recon_cache,
|
|
|
|
timeout='300', stats_interval='1', sync_method='rsync')
|
|
|
|
|
|
|
|
self.replicator = object_replicator.ObjectReplicator(
|
|
|
|
self.conf, logger=self.logger)
|
|
|
|
|
|
|
|
def tearDown(self):
|
|
|
|
self.assertFalse(process_errors)
|
|
|
|
rmtree(self.testdir, ignore_errors=1)
|
|
|
|
rmtree(self.recon_cache, ignore_errors=1)
|
|
|
|
|
|
|
|
def fake_replicate(self, override_devices, **kw):
|
|
|
|
# Faked-out replicate() method. Just updates the stats, but doesn't
|
|
|
|
# do any work.
|
|
|
|
for device in override_devices:
|
|
|
|
stats = self.replicator.stats_for_dev[device]
|
|
|
|
if device == 'sda':
|
|
|
|
stats.attempted = 1
|
|
|
|
stats.success = 10
|
|
|
|
stats.failure = 100
|
|
|
|
stats.hashmatch = 1000
|
|
|
|
stats.rsync = 10000
|
|
|
|
stats.remove = 100000
|
|
|
|
stats.suffix_count = 1000000
|
|
|
|
stats.suffix_hash = 10000000
|
|
|
|
stats.suffix_sync = 100000000
|
|
|
|
stats.failure_nodes = {
|
|
|
|
'10.1.1.1': {'d11': 1}}
|
|
|
|
elif device == 'sdb':
|
|
|
|
stats.attempted = 2
|
|
|
|
stats.success = 20
|
|
|
|
stats.failure = 200
|
|
|
|
stats.hashmatch = 2000
|
|
|
|
stats.rsync = 20000
|
|
|
|
stats.remove = 200000
|
|
|
|
stats.suffix_count = 2000000
|
|
|
|
stats.suffix_hash = 20000000
|
|
|
|
stats.suffix_sync = 200000000
|
|
|
|
stats.failure_nodes = {
|
|
|
|
'10.2.2.2': {'d22': 2}}
|
|
|
|
elif device == 'sdc':
|
|
|
|
stats.attempted = 3
|
|
|
|
stats.success = 30
|
|
|
|
stats.failure = 300
|
|
|
|
stats.hashmatch = 3000
|
|
|
|
stats.rsync = 30000
|
|
|
|
stats.remove = 300000
|
|
|
|
stats.suffix_count = 3000000
|
|
|
|
stats.suffix_hash = 30000000
|
|
|
|
stats.suffix_sync = 300000000
|
|
|
|
stats.failure_nodes = {
|
|
|
|
'10.3.3.3': {'d33': 3}}
|
|
|
|
elif device == 'sdd':
|
|
|
|
stats.attempted = 4
|
|
|
|
stats.success = 40
|
|
|
|
stats.failure = 400
|
|
|
|
stats.hashmatch = 4000
|
|
|
|
stats.rsync = 40000
|
|
|
|
stats.remove = 400000
|
|
|
|
stats.suffix_count = 4000000
|
|
|
|
stats.suffix_hash = 40000000
|
|
|
|
stats.suffix_sync = 400000000
|
|
|
|
stats.failure_nodes = {
|
|
|
|
'10.4.4.4': {'d44': 4}}
|
|
|
|
elif device == 'sde':
|
|
|
|
stats.attempted = 5
|
|
|
|
stats.success = 50
|
|
|
|
stats.failure = 500
|
|
|
|
stats.hashmatch = 5000
|
|
|
|
stats.rsync = 50000
|
|
|
|
stats.remove = 500000
|
|
|
|
stats.suffix_count = 5000000
|
|
|
|
stats.suffix_hash = 50000000
|
|
|
|
stats.suffix_sync = 500000000
|
|
|
|
stats.failure_nodes = {
|
|
|
|
'10.5.5.5': {'d55': 5}}
|
|
|
|
else:
|
|
|
|
raise Exception("mock can't handle %r" % device)
|
|
|
|
|
|
|
|
def test_no_multiprocessing(self):
|
|
|
|
self.replicator.replicator_workers = 0
|
|
|
|
self.assertEqual(self.replicator.get_worker_args(), [])
|
|
|
|
|
|
|
|
def test_device_distribution(self):
|
|
|
|
self.replicator.replicator_workers = 2
|
|
|
|
self.assertEqual(self.replicator.get_worker_args(), [{
|
|
|
|
'override_devices': ['sda', 'sdc', 'sde'],
|
|
|
|
'override_partitions': [],
|
|
|
|
'override_policies': [],
|
|
|
|
'have_overrides': False,
|
|
|
|
'multiprocess_worker_index': 0,
|
|
|
|
}, {
|
|
|
|
'override_devices': ['sdb', 'sdd'],
|
|
|
|
'override_partitions': [],
|
|
|
|
'override_policies': [],
|
|
|
|
'have_overrides': False,
|
|
|
|
'multiprocess_worker_index': 1,
|
|
|
|
}])
|
|
|
|
|
|
|
|
def test_override_policies(self):
|
|
|
|
self.replicator.replicator_workers = 2
|
|
|
|
args = self.replicator.get_worker_args(policies="3,5,7", once=True)
|
|
|
|
self.assertEqual(args, [{
|
|
|
|
'override_devices': ['sda', 'sdc', 'sde'],
|
|
|
|
'override_partitions': [],
|
|
|
|
'override_policies': [3, 5, 7],
|
|
|
|
'have_overrides': True,
|
|
|
|
'multiprocess_worker_index': 0,
|
|
|
|
}, {
|
|
|
|
'override_devices': ['sdb', 'sdd'],
|
|
|
|
'override_partitions': [],
|
|
|
|
'override_policies': [3, 5, 7],
|
|
|
|
'have_overrides': True,
|
|
|
|
'multiprocess_worker_index': 1,
|
|
|
|
}])
|
|
|
|
|
|
|
|
# override policies don't apply in run-forever mode
|
|
|
|
args = self.replicator.get_worker_args(policies="3,5,7", once=False)
|
|
|
|
self.assertEqual(args, [{
|
|
|
|
'override_devices': ['sda', 'sdc', 'sde'],
|
|
|
|
'override_partitions': [],
|
|
|
|
'override_policies': [],
|
|
|
|
'have_overrides': False,
|
|
|
|
'multiprocess_worker_index': 0,
|
|
|
|
}, {
|
|
|
|
'override_devices': ['sdb', 'sdd'],
|
|
|
|
'override_partitions': [],
|
|
|
|
'override_policies': [],
|
|
|
|
'have_overrides': False,
|
|
|
|
'multiprocess_worker_index': 1,
|
|
|
|
}])
|
|
|
|
|
|
|
|
def test_more_workers_than_disks(self):
|
|
|
|
self.replicator.replicator_workers = 999
|
|
|
|
self.assertEqual(self.replicator.get_worker_args(), [{
|
|
|
|
'override_devices': ['sda'],
|
|
|
|
'override_partitions': [],
|
|
|
|
'override_policies': [],
|
|
|
|
'have_overrides': False,
|
|
|
|
'multiprocess_worker_index': 0,
|
|
|
|
}, {
|
|
|
|
'override_devices': ['sdb'],
|
|
|
|
'override_partitions': [],
|
|
|
|
'override_policies': [],
|
|
|
|
'have_overrides': False,
|
|
|
|
'multiprocess_worker_index': 1,
|
|
|
|
}, {
|
|
|
|
'override_devices': ['sdc'],
|
|
|
|
'override_partitions': [],
|
|
|
|
'override_policies': [],
|
|
|
|
'have_overrides': False,
|
|
|
|
'multiprocess_worker_index': 2,
|
|
|
|
}, {
|
|
|
|
'override_devices': ['sdd'],
|
|
|
|
'override_partitions': [],
|
|
|
|
'override_policies': [],
|
|
|
|
'have_overrides': False,
|
|
|
|
'multiprocess_worker_index': 3,
|
|
|
|
}, {
|
|
|
|
'override_devices': ['sde'],
|
|
|
|
'override_partitions': [],
|
|
|
|
'override_policies': [],
|
|
|
|
'have_overrides': False,
|
|
|
|
'multiprocess_worker_index': 4,
|
|
|
|
}])
|
|
|
|
|
|
|
|
# Remember how many workers we actually have so that the log-line
|
|
|
|
# prefixes are reasonable. Otherwise, we'd have five workers, each
|
|
|
|
# logging lines starting with things like "[worker X/999 pid=P]"
|
|
|
|
# despite there being only five.
|
|
|
|
self.assertEqual(self.replicator.replicator_workers, 5)
|
|
|
|
|
|
|
|
def test_command_line_overrides(self):
|
|
|
|
self.replicator.replicator_workers = 2
|
|
|
|
|
|
|
|
args = self.replicator.get_worker_args(
|
|
|
|
devices="sda,sdc,sdd", partitions="12,34,56", once=True)
|
|
|
|
self.assertEqual(args, [{
|
|
|
|
'override_devices': ['sda', 'sdd'],
|
|
|
|
'override_partitions': [12, 34, 56],
|
|
|
|
'override_policies': [],
|
|
|
|
'have_overrides': True,
|
|
|
|
'multiprocess_worker_index': 0,
|
|
|
|
}, {
|
|
|
|
'override_devices': ['sdc'],
|
|
|
|
'override_partitions': [12, 34, 56],
|
|
|
|
'override_policies': [],
|
|
|
|
'have_overrides': True,
|
|
|
|
'multiprocess_worker_index': 1,
|
|
|
|
}])
|
|
|
|
|
|
|
|
args = self.replicator.get_worker_args(
|
|
|
|
devices="sda,sdc,sdd", once=True)
|
|
|
|
self.assertEqual(args, [{
|
|
|
|
'override_devices': ['sda', 'sdd'],
|
|
|
|
'override_partitions': [],
|
|
|
|
'override_policies': [],
|
|
|
|
'have_overrides': True,
|
|
|
|
'multiprocess_worker_index': 0,
|
|
|
|
}, {
|
|
|
|
'override_devices': ['sdc'],
|
|
|
|
'override_partitions': [],
|
|
|
|
'override_policies': [],
|
|
|
|
'have_overrides': True,
|
|
|
|
'multiprocess_worker_index': 1,
|
|
|
|
}])
|
|
|
|
|
|
|
|
# no overrides apply in run-forever mode
|
|
|
|
args = self.replicator.get_worker_args(
|
|
|
|
devices="sda,sdc,sdd", partitions="12,34,56", once=False)
|
|
|
|
self.assertEqual(args, [{
|
|
|
|
'override_devices': ['sda', 'sdc', 'sde'],
|
|
|
|
'override_partitions': [],
|
|
|
|
'override_policies': [],
|
|
|
|
'have_overrides': False,
|
|
|
|
'multiprocess_worker_index': 0,
|
|
|
|
}, {
|
|
|
|
'override_devices': ['sdb', 'sdd'],
|
|
|
|
'override_partitions': [],
|
|
|
|
'override_policies': [],
|
|
|
|
'have_overrides': False,
|
|
|
|
'multiprocess_worker_index': 1,
|
|
|
|
}])
|
|
|
|
|
|
|
|
def test_worker_logging(self):
|
|
|
|
self.replicator.replicator_workers = 3
|
|
|
|
|
|
|
|
def log_some_stuff(*a, **kw):
|
|
|
|
self.replicator.logger.debug("debug message")
|
|
|
|
self.replicator.logger.info("info message")
|
|
|
|
self.replicator.logger.warning("warning message")
|
|
|
|
self.replicator.logger.error("error message")
|
|
|
|
|
|
|
|
with mock.patch.object(self.replicator, 'replicate', log_some_stuff), \
|
|
|
|
mock.patch("os.getpid", lambda: 8804):
|
|
|
|
self.replicator.get_worker_args()
|
|
|
|
self.replicator.run_once(multiprocess_worker_index=0,
|
|
|
|
override_devices=['sda', 'sdb'])
|
|
|
|
|
|
|
|
prefix = "[worker 1/3 pid=8804] "
|
|
|
|
for level, lines in self.logger.logger.all_log_lines().items():
|
|
|
|
for line in lines:
|
|
|
|
self.assertTrue(
|
|
|
|
line.startswith(prefix),
|
|
|
|
"%r doesn't start with %r (level %s)" % (
|
|
|
|
line, prefix, level))
|
|
|
|
|
|
|
|
def test_recon_run_once(self):
|
|
|
|
self.replicator.replicator_workers = 3
|
|
|
|
|
|
|
|
the_time = [1521680000]
|
|
|
|
|
|
|
|
def mock_time():
|
|
|
|
rv = the_time[0]
|
|
|
|
the_time[0] += 120
|
|
|
|
return rv
|
|
|
|
|
|
|
|
# Simulate a couple child processes
|
|
|
|
with mock.patch.object(self.replicator, 'replicate',
|
|
|
|
self.fake_replicate), \
|
|
|
|
mock.patch('time.time', mock_time):
|
|
|
|
self.replicator.get_worker_args()
|
|
|
|
self.replicator.run_once(multiprocess_worker_index=0,
|
|
|
|
override_devices=['sda', 'sdb'])
|
|
|
|
self.replicator.run_once(multiprocess_worker_index=1,
|
|
|
|
override_devices=['sdc'])
|
|
|
|
self.replicator.run_once(multiprocess_worker_index=2,
|
|
|
|
override_devices=['sdd', 'sde'])
|
|
|
|
|
|
|
|
with open(self.recon_file) as fh:
|
|
|
|
recon_data = json.load(fh)
|
|
|
|
self.assertIn('object_replication_per_disk', recon_data)
|
|
|
|
self.assertIn('sda', recon_data['object_replication_per_disk'])
|
|
|
|
self.assertIn('sdb', recon_data['object_replication_per_disk'])
|
|
|
|
self.assertIn('sdc', recon_data['object_replication_per_disk'])
|
|
|
|
self.assertIn('sdd', recon_data['object_replication_per_disk'])
|
|
|
|
self.assertIn('sde', recon_data['object_replication_per_disk'])
|
|
|
|
sda = recon_data['object_replication_per_disk']['sda']
|
|
|
|
|
|
|
|
# Spot-check a couple of fields
|
|
|
|
self.assertEqual(sda['replication_stats']['attempted'], 1)
|
|
|
|
self.assertEqual(sda['replication_stats']['success'], 10)
|
|
|
|
self.assertEqual(sda['object_replication_time'], 2) # minutes
|
|
|
|
self.assertEqual(sda['object_replication_last'], 1521680120)
|
|
|
|
|
|
|
|
# Aggregate the workers' recon updates
|
|
|
|
self.replicator.post_multiprocess_run()
|
|
|
|
with open(self.recon_file) as fh:
|
|
|
|
recon_data = json.load(fh)
|
|
|
|
self.assertEqual(recon_data['replication_stats']['attempted'], 15)
|
|
|
|
self.assertEqual(recon_data['replication_stats']['failure'], 1500)
|
|
|
|
self.assertEqual(recon_data['replication_stats']['hashmatch'], 15000)
|
|
|
|
self.assertEqual(recon_data['replication_stats']['remove'], 1500000)
|
|
|
|
self.assertEqual(recon_data['replication_stats']['rsync'], 150000)
|
|
|
|
self.assertEqual(recon_data['replication_stats']['success'], 150)
|
|
|
|
self.assertEqual(recon_data['replication_stats']['suffix_count'],
|
|
|
|
15000000)
|
|
|
|
self.assertEqual(recon_data['replication_stats']['suffix_hash'],
|
|
|
|
150000000)
|
|
|
|
self.assertEqual(recon_data['replication_stats']['suffix_sync'],
|
|
|
|
1500000000)
|
|
|
|
self.assertEqual(recon_data['replication_stats']['failure_nodes'], {
|
|
|
|
'10.1.1.1': {'d11': 1},
|
|
|
|
'10.2.2.2': {'d22': 2},
|
|
|
|
'10.3.3.3': {'d33': 3},
|
|
|
|
'10.4.4.4': {'d44': 4},
|
|
|
|
'10.5.5.5': {'d55': 5},
|
|
|
|
})
|
|
|
|
self.assertEqual(recon_data['object_replication_time'], 2) # minutes
|
|
|
|
self.assertEqual(recon_data['object_replication_last'], 1521680120)
|
|
|
|
|
|
|
|
def test_recon_skipped_with_overrides(self):
|
|
|
|
self.replicator.replicator_workers = 3
|
|
|
|
|
|
|
|
the_time = [1521680000]
|
|
|
|
|
|
|
|
def mock_time():
|
|
|
|
rv = the_time[0]
|
|
|
|
the_time[0] += 120
|
|
|
|
return rv
|
|
|
|
|
|
|
|
with mock.patch.object(self.replicator, 'replicate',
|
|
|
|
self.fake_replicate), \
|
|
|
|
mock.patch('time.time', mock_time):
|
|
|
|
self.replicator.get_worker_args()
|
|
|
|
self.replicator.run_once(multiprocess_worker_index=0,
|
|
|
|
have_overrides=True,
|
|
|
|
override_devices=['sda', 'sdb'])
|
|
|
|
self.assertFalse(os.path.exists(self.recon_file))
|
|
|
|
|
|
|
|
# have_overrides=False makes us get recon stats
|
|
|
|
with mock.patch.object(self.replicator, 'replicate',
|
|
|
|
self.fake_replicate), \
|
|
|
|
mock.patch('time.time', mock_time):
|
|
|
|
self.replicator.get_worker_args()
|
|
|
|
self.replicator.run_once(multiprocess_worker_index=0,
|
|
|
|
have_overrides=False,
|
|
|
|
override_devices=['sda', 'sdb'])
|
|
|
|
with open(self.recon_file) as fh:
|
|
|
|
recon_data = json.load(fh)
|
|
|
|
self.assertIn('sda', recon_data['object_replication_per_disk'])
|
|
|
|
|
|
|
|
def test_recon_run_forever(self):
|
|
|
|
the_time = [1521521521.52152]
|
|
|
|
|
|
|
|
def mock_time():
|
|
|
|
rv = the_time[0]
|
|
|
|
the_time[0] += 120
|
|
|
|
return rv
|
|
|
|
|
|
|
|
self.replicator.replicator_workers = 2
|
|
|
|
self.replicator._next_rcache_update = the_time[0]
|
|
|
|
|
|
|
|
# One worker has finished a pass, the other hasn't.
|
|
|
|
with mock.patch.object(self.replicator, 'replicate',
|
|
|
|
self.fake_replicate), \
|
|
|
|
mock.patch('time.time', mock_time):
|
|
|
|
self.replicator.get_worker_args()
|
|
|
|
# Yes, this says run_once, but this is only to populate
|
|
|
|
# object.recon with some stats. The real test is for the
|
|
|
|
# aggregation.
|
|
|
|
self.replicator.run_once(multiprocess_worker_index=0,
|
|
|
|
override_devices=['sda', 'sdb', 'sdc'])
|
|
|
|
|
|
|
|
# This will not produce aggregate stats since not every device has
|
|
|
|
# finished a pass.
|
|
|
|
the_time[0] += self.replicator.stats_interval
|
|
|
|
with mock.patch('time.time', mock_time):
|
|
|
|
rv = self.replicator.is_healthy()
|
|
|
|
self.assertTrue(rv)
|
|
|
|
with open(self.recon_file) as fh:
|
|
|
|
recon_data = json.load(fh)
|
|
|
|
self.assertNotIn('replication_stats', recon_data)
|
|
|
|
|
|
|
|
# Now all the local devices have completed a replication pass, so we
|
|
|
|
# will produce aggregate stats.
|
|
|
|
with mock.patch.object(self.replicator, 'replicate',
|
|
|
|
self.fake_replicate), \
|
|
|
|
mock.patch('time.time', mock_time):
|
|
|
|
self.replicator.get_worker_args()
|
|
|
|
self.replicator.run_once(multiprocess_worker_index=1,
|
|
|
|
override_devices=['sdd', 'sde'])
|
|
|
|
the_time[0] += self.replicator.stats_interval
|
|
|
|
with mock.patch('time.time', mock_time):
|
|
|
|
rv = self.replicator.is_healthy()
|
|
|
|
self.assertTrue(rv)
|
|
|
|
with open(self.recon_file) as fh:
|
|
|
|
recon_data = json.load(fh)
|
|
|
|
self.assertIn('replication_stats', recon_data)
|
|
|
|
|
|
|
|
# no need to exhaustively check every sum
|
|
|
|
self.assertEqual(recon_data['replication_stats']['attempted'], 15)
|
|
|
|
self.assertEqual(recon_data['replication_stats']['success'], 150)
|
|
|
|
|
|
|
|
self.assertEqual(
|
|
|
|
recon_data['replication_last'],
|
|
|
|
min(pd['replication_last']
|
|
|
|
for pd in recon_data['object_replication_per_disk'].values()))
|
|
|
|
|
|
|
|
|
|
|
|
class TestReplicatorStats(unittest.TestCase):
    def test_to_recon(self):
        # to_recon() must expose every counter plus the failure-node map,
        # with the same names and values it was constructed with.
        kwargs = dict(
            attempted=1, failure=2, hashmatch=3, remove=4,
            rsync=5, success=7,
            suffix_count=8, suffix_hash=9, suffix_sync=10,
            failure_nodes={'10.1.2.3': {'sda': 100, 'sdb': 200}})
        st = object_replicator.Stats(**kwargs)
        # This is what appears in the recon dump
        self.assertEqual(st.to_recon(), kwargs)
def test_recon_roundtrip(self):
|
|
|
|
before = object_replicator.Stats(
|
|
|
|
attempted=1, failure=2, hashmatch=3, remove=4,
|
|
|
|
rsync=5, success=7,
|
|
|
|
suffix_count=8, suffix_hash=9, suffix_sync=10,
|
|
|
|
failure_nodes={'10.1.2.3': {'sda': 100, 'sdb': 200}})
|
|
|
|
after = object_replicator.Stats.from_recon(before.to_recon())
|
|
|
|
self.assertEqual(after.attempted, before.attempted)
|
|
|
|
self.assertEqual(after.failure, before.failure)
|
|
|
|
self.assertEqual(after.hashmatch, before.hashmatch)
|
|
|
|
self.assertEqual(after.remove, before.remove)
|
|
|
|
self.assertEqual(after.rsync, before.rsync)
|
|
|
|
self.assertEqual(after.success, before.success)
|
|
|
|
self.assertEqual(after.suffix_count, before.suffix_count)
|
|
|
|
self.assertEqual(after.suffix_hash, before.suffix_hash)
|
|
|
|
self.assertEqual(after.suffix_sync, before.suffix_sync)
|
|
|
|
self.assertEqual(after.failure_nodes, before.failure_nodes)
|
|
|
|
|
|
|
|
def test_from_recon_skips_extra_fields(self):
|
|
|
|
# If another attribute ever sneaks its way in, we should ignore it.
|
|
|
|
# This will make aborted upgrades a little less painful for
|
|
|
|
# operators.
|
|
|
|
recon_dict = {'attempted': 1, 'failure': 2, 'hashmatch': 3,
|
|
|
|
'spices': 5, 'treasures': 8}
|
|
|
|
stats = object_replicator.Stats.from_recon(recon_dict)
|
|
|
|
self.assertEqual(stats.attempted, 1)
|
|
|
|
self.assertEqual(stats.failure, 2)
|
|
|
|
self.assertEqual(stats.hashmatch, 3)
|
|
|
|
# We don't gain attributes just because they're in object.recon.
|
|
|
|
self.assertFalse(hasattr(stats, 'spices'))
|
|
|
|
self.assertFalse(hasattr(stats, 'treasures'))
|
|
|
|
|
|
|
|
def test_add_failure_stats(self):
|
|
|
|
st = object_replicator.Stats()
|
|
|
|
st.add_failure_stats([('10.1.1.1', 'd10'), ('10.1.1.1', 'd11')])
|
|
|
|
st.add_failure_stats([('10.1.1.1', 'd10')])
|
|
|
|
st.add_failure_stats([('10.1.1.1', 'd12'), ('10.2.2.2', 'd20'),
|
|
|
|
('10.2.2.2', 'd21'), ('10.2.2.2', 'd21'),
|
|
|
|
('10.2.2.2', 'd21')])
|
|
|
|
self.assertEqual(st.failure, 8)
|
|
|
|
|
|
|
|
as_dict = st.to_recon()
|
|
|
|
self.assertEqual(as_dict['failure_nodes'], {
|
|
|
|
'10.1.1.1': {
|
|
|
|
'd10': 2,
|
|
|
|
'd11': 1,
|
|
|
|
'd12': 1,
|
|
|
|
},
|
|
|
|
'10.2.2.2': {
|
|
|
|
'd20': 1,
|
|
|
|
'd21': 3,
|
|
|
|
},
|
|
|
|
})
|
|
|
|
|
|
|
|
def test_add(self):
|
|
|
|
st1 = object_replicator.Stats(
|
|
|
|
attempted=1, failure=2, hashmatch=3, remove=4, rsync=5,
|
|
|
|
success=6, suffix_count=7, suffix_hash=8, suffix_sync=9,
|
|
|
|
failure_nodes={
|
|
|
|
'10.1.1.1': {'sda': 10, 'sdb': 20},
|
|
|
|
'10.1.1.2': {'sda': 10, 'sdb': 20}})
|
|
|
|
st2 = object_replicator.Stats(
|
|
|
|
attempted=2, failure=4, hashmatch=6, remove=8, rsync=10,
|
|
|
|
success=12, suffix_count=14, suffix_hash=16, suffix_sync=18,
|
|
|
|
failure_nodes={
|
|
|
|
'10.1.1.2': {'sda': 10, 'sdb': 20},
|
|
|
|
'10.1.1.3': {'sda': 10, 'sdb': 20}})
|
|
|
|
total = st1 + st2
|
|
|
|
self.assertEqual(total.attempted, 3)
|
|
|
|
self.assertEqual(total.failure, 6)
|
|
|
|
self.assertEqual(total.hashmatch, 9)
|
|
|
|
self.assertEqual(total.remove, 12)
|
|
|
|
self.assertEqual(total.rsync, 15)
|
|
|
|
self.assertEqual(total.success, 18)
|
|
|
|
self.assertEqual(total.suffix_count, 21)
|
|
|
|
self.assertEqual(total.suffix_hash, 24)
|
|
|
|
self.assertEqual(total.suffix_sync, 27)
|
|
|
|
self.assertEqual(total.failure_nodes, {
|
|
|
|
'10.1.1.1': {'sda': 10, 'sdb': 20},
|
|
|
|
'10.1.1.2': {'sda': 20, 'sdb': 40},
|
|
|
|
'10.1.1.3': {'sda': 10, 'sdb': 20},
|
|
|
|
})
|
|
|
|
|
|
|
|
|
2010-07-12 17:03:45 -05:00
|
|
|
# Allow running this test module directly.
if __name__ == '__main__':
    unittest.main()