Merge "Quarantine stale EC fragments after checking handoffs"

Zuul 2021-05-11 22:44:16 +00:00 committed by Gerrit Code Review
commit 5ec3826246
13 changed files with 1239 additions and 142 deletions

View File

@ -390,6 +390,39 @@ use = egg:swift#recon
# the environment (default). For more information, see
# https://bugs.launchpad.net/liberasurecode/+bug/1886088
# write_legacy_ec_crc =
#
# When attempting to reconstruct a missing fragment on another node from a
# fragment on the local node, the reconstructor may fail to fetch sufficient
# fragments to reconstruct the missing fragment. This may be because most or
# all of the remote fragments have been deleted, and the local fragment is
# stale, in which case the reconstructor will never succeed in reconstructing
# the apparently missing fragment and will log errors. If the object's
# tombstones have been reclaimed then the stale fragment will never be deleted
# (see https://bugs.launchpad.net/swift/+bug/1655608). If an operator suspects
# that stale fragments have been re-introduced to the cluster and is seeing
# error logs similar to those in the bug report, then the quarantine_threshold
# option may be set to a value greater than zero. This enables the
# reconstructor to quarantine the stale fragments when it fails to fetch more
# than the quarantine_threshold number of fragments (including the stale
# fragment) during an attempt to reconstruct. For example, setting the
# quarantine_threshold to 1 would cause a fragment to be quarantined if no
# other fragments can be fetched. The value may be reset to zero after the
# reconstructor has run on all affected nodes and the error logs are no longer
# seen.
# Note: the quarantine_threshold applies equally to all policies, but for each
# policy it is effectively capped at (ec_ndata - 1) so that a fragment is never
# quarantined when sufficient fragments exist to reconstruct the object.
# Fragments are not quarantined until they are older than the reclaim_age.
# quarantine_threshold = 0
#
# Sets the maximum number of nodes to which requests will be made before
# quarantining a fragment. You can use '* replicas' at the end to have it use
# the number given times the number of replicas for the ring being used for the
# requests. The minimum number of nodes to which requests are made is the
# number of replicas for the policy minus 1 (excluding the node on which the
# fragment is to be rebuilt). The minimum is only exceeded if
# request_node_count is greater, and only for the purposes of quarantining.
# request_node_count = 2 * replicas
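
A minimal sketch (illustrative only, assuming a hypothetical 10+4 EC policy with 14 primary nodes and the default values shown above) of how the reconstructor combines these two options into its node budget and effective threshold:

    # hypothetical values; mirrors the arithmetic the reconstructor performs
    replicas = 14                          # primary nodes for the EC ring
    quarantine_threshold = 1               # operator has enabled quarantining

    def request_node_count(replicas):      # parsed from '2 * replicas'
        return 2 * replicas

    # a normal rebuild attempt only ever queries the other primaries
    primary_node_count = replicas
    # verifying a quarantine candidate may extend to handoffs, but never
    # beyond max_node_count nodes in total
    max_node_count = max(primary_node_count, request_node_count(replicas))

    # the threshold never allows quarantining when sufficient fragments
    # exist to reconstruct the object
    ec_ndata = 10
    effective_threshold = min(quarantine_threshold, ec_ndata - 1)
    print(primary_node_count, max_node_count, effective_threshold)  # 14 28 1
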
[object-updater]
# You can override the default log routing for this app here (don't use set!):

View File

@ -220,8 +220,8 @@ use = egg:swift#proxy
# to use up to replica count threads when waiting on a response. In
# conjunction with the concurrency_timeout option this will allow swift to send
# out GET/HEAD requests to the storage nodes concurrently and answer as soon as
# the minimum number of backend responses are availabe - in replicated contexts
# this will be the first backend replica to respond.
# the minimum number of backend responses are available - in replicated
# contexts this will be the first backend replica to respond.
# concurrent_gets = off
#
# This parameter controls how long to wait before firing off the next

View File

@ -515,6 +515,23 @@ def config_percent_value(value):
raise ValueError("%s: %s" % (str(err), value))
def config_request_node_count_value(value):
try:
value_parts = value.lower().split()
rnc_value = int(value_parts[0])
except (ValueError, AttributeError):
pass
else:
if len(value_parts) == 1:
return lambda replicas: rnc_value
elif (len(value_parts) == 3 and
value_parts[1] == '*' and
value_parts[2] == 'replicas'):
return lambda replicas: rnc_value * replicas
raise ValueError(
'Invalid request_node_count value: %r' % value)
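
A brief usage sketch of the helper above (illustrative; the cases mirror the new unit test added later in this change):

    from swift.common.utils import config_request_node_count_value

    rnc = config_request_node_count_value('2 * replicas')
    assert rnc(3) == 6 and rnc(11) == 22
    rnc = config_request_node_count_value('10')
    assert rnc(11) == 10
    try:
        config_request_node_count_value('2.5 * replicas')
    except ValueError:
        pass  # non-integer or malformed values are rejected
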
def append_underscore(prefix):
if prefix and not prefix.endswith('_'):
prefix += '_'

View File

@ -12,7 +12,7 @@
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import json
import errno
import os
@ -33,7 +33,8 @@ from swift.common.utils import (
dump_recon_cache, mkdirs, config_true_value,
GreenAsyncPile, Timestamp, remove_file,
load_recon_cache, parse_override_options, distribute_evenly,
PrefixLoggerAdapter, remove_directory)
PrefixLoggerAdapter, remove_directory, config_request_node_count_value,
non_negative_int)
from swift.common.header_key_dict import HeaderKeyDict
from swift.common.bufferedhttp import http_connect
from swift.common.daemon import Daemon
@ -102,8 +103,8 @@ class ResponseBucket(object):
def __init__(self):
# count of all responses associated with this Bucket
self.num_responses = 0
# map {frag_index: response} for subset of responses that
# could be used to rebuild the missing fragment
# map {frag_index: response} for subset of responses that could be used
# to rebuild the missing fragment
self.useful_responses = {}
# set if a durable timestamp was seen in responses
self.durable = False
@ -232,6 +233,10 @@ class ObjectReconstructor(Daemon):
'of handoffs_only.')
self.rebuild_handoff_node_count = int(conf.get(
'rebuild_handoff_node_count', 2))
self.quarantine_threshold = non_negative_int(
conf.get('quarantine_threshold', 0))
self.request_node_count = config_request_node_count_value(
conf.get('request_node_count', '2 * replicas'))
# When upgrading from liberasurecode<=1.5.0, you may want to continue
# writing legacy CRCs until all nodes are upgraded and capable of
@ -368,23 +373,24 @@ class ObjectReconstructor(Daemon):
return False
return True
def _get_response(self, node, part, path, headers, full_path):
def _get_response(self, node, policy, partition, path, headers):
"""
Helper method for reconstruction that GETs a single EC fragment
archive
:param node: the node to GET from
:param part: the partition
:param policy: the job policy
:param partition: the partition
:param path: path of the desired EC archive relative to partition dir
:param headers: the headers to send
:param full_path: full path to desired EC archive
:returns: response
"""
full_path = _full_path(node, partition, path, policy)
resp = None
try:
with ConnectionTimeout(self.conn_timeout):
conn = http_connect(node['ip'], node['port'], node['device'],
part, 'GET', path, headers=headers)
partition, 'GET', path, headers=headers)
with Timeout(self.node_timeout):
resp = conn.getresponse()
resp.full_path = full_path
@ -462,7 +468,11 @@ class ObjectReconstructor(Daemon):
fi_to_rebuild)
return None
if fi_to_rebuild == resp_frag_index:
durable_timestamp = resp.headers.get('X-Backend-Durable-Timestamp')
if durable_timestamp:
buckets[Timestamp(durable_timestamp)].durable = True
if resp_frag_index == fi_to_rebuild:
# TODO: With duplicated EC frags it's not unreasonable to find the
# very fragment we're trying to rebuild exists on another primary
# node. In this case we should stream it directly from the remote
@ -471,19 +481,43 @@ class ObjectReconstructor(Daemon):
'Found existing frag #%s at %s while rebuilding to %s',
fi_to_rebuild, resp.full_path,
_full_path(node, partition, path, policy))
return None
durable_timestamp = resp.headers.get('X-Backend-Durable-Timestamp')
if durable_timestamp:
buckets[Timestamp(durable_timestamp)].durable = True
if resp_frag_index not in bucket.useful_responses:
elif resp_frag_index not in bucket.useful_responses:
bucket.useful_responses[resp_frag_index] = resp
return bucket
return None
# else: duplicate frag_index isn't useful for rebuilding
def _make_fragment_requests(self, job, node, datafile_metadata, buckets,
error_responses, nodes):
return bucket
def _is_quarantine_candidate(self, policy, buckets, error_responses, df):
# This condition is deliberately strict because it determines if
# more requests will be issued and ultimately if the fragment
# will be quarantined.
if list(error_responses.keys()) != [404]:
# only quarantine if all other responses are 404 so we are
# confident there are no other frags on queried nodes
return False
local_timestamp = Timestamp(df.get_datafile_metadata()['X-Timestamp'])
if list(buckets.keys()) != [local_timestamp]:
# don't quarantine if there's insufficient other timestamp
# frags, or no response for the local frag timestamp: we
# possibly could quarantine, but this unexpected case may be
# worth more investigation
return False
if time.time() - float(local_timestamp) <= df.manager.reclaim_age:
# If the fragment has not yet passed reclaim age then it is
# likely that a tombstone will be reverted to this node, or
# neighbor frags will get reverted from handoffs to *other* nodes
# and we'll discover we *do* have enough to reconstruct. Don't
# quarantine it yet: better that it is cleaned up 'normally'.
return False
bucket = buckets[local_timestamp]
return (bucket.num_responses <= self.quarantine_threshold and
bucket.num_responses < policy.ec_ndata and
df._frag_index in bucket.useful_responses)
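
For illustration, a simplified standalone sketch of the gates above, with plain values standing in for the real error_responses, buckets and diskfile state (the frag-index check is omitted; not part of the patch):

    # simplified stand-ins for the state examined by _is_quarantine_candidate
    ec_ndata = 10                     # data frags in a hypothetical 10+4 policy
    quarantine_threshold = 1
    reclaim_age = 604800              # one week, the default

    error_statuses = {404}            # statuses seen among error responses
    frags_at_local_timestamp = 1      # only our own stale frag was found
    other_timestamps_seen = False     # no frags at any other timestamp
    local_frag_age = 2 * reclaim_age  # local frag is well past reclaim_age

    candidate = (error_statuses == {404} and
                 not other_timestamps_seen and
                 local_frag_age > reclaim_age and
                 frags_at_local_timestamp <= quarantine_threshold and
                 frags_at_local_timestamp < ec_ndata)
    print(candidate)                  # True -> quarantine is permitted
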
def _make_fragment_requests(self, job, node, df, buckets, error_responses):
"""
Issue requests for fragments to the list of ``nodes`` and sort the
responses into per-timestamp ``buckets`` or per-status
@ -492,16 +526,15 @@ class ObjectReconstructor(Daemon):
:param job: job from ssync_sender.
:param node: node to which we're rebuilding.
:param datafile_metadata: the datafile metadata to attach to
the rebuilt fragment archive
:param df: an instance of :class:`~swift.obj.diskfile.BaseDiskFile`.
:param buckets: dict of per-timestamp buckets for ok responses.
:param error_responses: dict of per-status lists of error responses.
:param nodes: A list of nodes.
:return: A per-timestamp bucket with sufficient responses, or None if
there is no such bucket.
"""
policy = job['policy']
partition = job['partition']
datafile_metadata = df.get_datafile_metadata()
# the fragment index we need to reconstruct is the position index
# of the node we're rebuilding to within the primary part list
@ -521,24 +554,80 @@ class ObjectReconstructor(Daemon):
headers['X-Backend-Fragment-Preferences'] = json.dumps(frag_prefs)
path = datafile_metadata['name']
pile = GreenAsyncPile(len(nodes))
for _node in nodes:
full_get_path = _full_path(_node, partition, path, policy)
pile.spawn(self._get_response, _node, partition,
path, headers, full_get_path)
ring = policy.object_ring
primary_nodes = ring.get_part_nodes(partition)
# primary_node_count is the maximum number of nodes to consume in a
# normal rebuild attempt when there is no quarantine candidate,
# including the node to which we are rebuilding
primary_node_count = len(primary_nodes)
# don't try and fetch a fragment from the node we're rebuilding to
filtered_primary_nodes = [n for n in primary_nodes
if n['id'] != node['id']]
# concurrency is the number of requests fired off in initial batch
concurrency = len(filtered_primary_nodes)
# max_node_count is the maximum number of nodes to consume when
# verifying a quarantine candidate and is at least primary_node_count
max_node_count = max(primary_node_count,
self.request_node_count(primary_node_count))
pile = GreenAsyncPile(concurrency)
for primary_node in filtered_primary_nodes:
pile.spawn(self._get_response, primary_node, policy, partition,
path, headers)
useful_bucket = None
for resp in pile:
bucket = self._handle_fragment_response(
node, policy, partition, fi_to_rebuild, path, buckets,
error_responses, resp)
if bucket and len(bucket.useful_responses) >= policy.ec_ndata:
frag_indexes = list(bucket.useful_responses.keys())
self.logger.debug('Reconstruct frag #%s with frag indexes %s'
% (fi_to_rebuild, frag_indexes))
return bucket
return None
useful_bucket = bucket
break
def reconstruct_fa(self, job, node, datafile_metadata):
# Once all rebuild nodes have responded, if we have a quarantine
# candidate, go beyond primary_node_count and on to handoffs. The
# first non-404 response will prevent quarantine, but the expected
# common case is all 404 responses so we use some concurrency to get an
# outcome faster at the risk of some unnecessary requests in the
# uncommon case.
if (not useful_bucket and
self._is_quarantine_candidate(
policy, buckets, error_responses, df)):
node_count = primary_node_count
handoff_iter = itertools.islice(ring.get_more_nodes(partition),
max_node_count - node_count)
for handoff_node in itertools.islice(handoff_iter, concurrency):
node_count += 1
pile.spawn(self._get_response, handoff_node, policy, partition,
path, headers)
for resp in pile:
bucket = self._handle_fragment_response(
node, policy, partition, fi_to_rebuild, path, buckets,
error_responses, resp)
if bucket and len(bucket.useful_responses) >= policy.ec_ndata:
useful_bucket = bucket
self.logger.debug(
'Reconstructing frag from handoffs, node_count=%d'
% node_count)
break
elif self._is_quarantine_candidate(
policy, buckets, error_responses, df):
try:
handoff_node = next(handoff_iter)
node_count += 1
pile.spawn(self._get_response, handoff_node, policy,
partition, path, headers)
except StopIteration:
pass
# else: this frag is no longer a quarantine candidate, so we
# could break right here and ignore any remaining responses,
# but given that we may have actually found another frag we'll
# optimistically wait for any remaining responses in case a
# useful bucket is assembled.
return useful_bucket
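
The handoff phase above relies on spawning additional requests while still consuming earlier responses from the same pile. A minimal sketch of that spawn-while-iterating pattern with GreenAsyncPile, using a dummy task in place of _get_response (illustrative only; assumes swift and eventlet are importable):

    from swift.common.utils import GreenAsyncPile

    def fetch(node_id):
        return node_id                 # stand-in for an HTTP GET to one node

    pile = GreenAsyncPile(3)           # initial concurrency
    for node_id in (1, 2, 3):          # first batch of requests
        pile.spawn(fetch, node_id)

    reserve = iter([4, 5, 6])          # further nodes, used only if needed
    for result in pile:
        if result != 2:                # e.g. "still a quarantine candidate"
            try:
                pile.spawn(fetch, next(reserve))  # trickle out one more request
            except StopIteration:
                pass                   # node iterator exhausted
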
def reconstruct_fa(self, job, node, df):
"""
Reconstructs a fragment archive - this method is called from ssync
after a remote node responds that is missing this object - the local
@ -547,8 +636,7 @@ class ObjectReconstructor(Daemon):
:param job: job from ssync_sender.
:param node: node to which we're rebuilding.
:param datafile_metadata: the datafile metadata to attach to
the rebuilt fragment archive
:param df: an instance of :class:`~swift.obj.diskfile.BaseDiskFile`.
:returns: a DiskFile like class for use by ssync.
:raises DiskFileQuarantined: if the fragment archive cannot be
reconstructed and has as a result been quarantined.
@ -559,6 +647,7 @@ class ObjectReconstructor(Daemon):
# the fragment index we need to reconstruct is the position index
# of the node we're rebuilding to within the primary part list
fi_to_rebuild = node['backend_index']
datafile_metadata = df.get_datafile_metadata()
local_timestamp = Timestamp(datafile_metadata['X-Timestamp'])
path = datafile_metadata['name']
@ -566,12 +655,13 @@ class ObjectReconstructor(Daemon):
error_responses = defaultdict(list) # map status code -> response list
# don't try and fetch a fragment from the node we're rebuilding to
part_nodes = [n for n in policy.object_ring.get_part_nodes(partition)
if n['id'] != node['id']]
useful_bucket = self._make_fragment_requests(
job, node, datafile_metadata, buckets, error_responses, part_nodes)
job, node, df, buckets, error_responses)
if useful_bucket:
frag_indexes = list(useful_bucket.useful_responses.keys())
self.logger.debug('Reconstruct frag #%s with frag indexes %s'
% (fi_to_rebuild, frag_indexes))
responses = list(useful_bucket.useful_responses.values())
rebuilt_fragment_iter = self.make_rebuilt_fragment_iter(
responses[:policy.ec_ndata], path, policy, fi_to_rebuild)
@ -601,6 +691,10 @@ class ObjectReconstructor(Daemon):
errors, 'durable' if durable else 'non-durable',
full_path, fi_to_rebuild))
if self._is_quarantine_candidate(policy, buckets, error_responses, df):
raise df._quarantine(
df._data_file, "Solitary fragment #%s" % df._frag_index)
raise DiskFileError('Unable to reconstruct EC archive')
def _reconstruct(self, policy, fragment_payload, frag_index):

View File

@ -374,7 +374,7 @@ class Sender(object):
# from the data file only.
df_alt = self.job.get(
'sync_diskfile_builder', lambda *args: df)(
self.job, self.node, df.get_datafile_metadata())
self.job, self.node, df)
self.send_put(connection, url_path, df_alt,
durable=is_durable)
if want.get('meta') and df.data_timestamp != df.timestamp:

View File

@ -35,7 +35,8 @@ from swift.common.ring import Ring
from swift.common.utils import Watchdog, get_logger, \
get_remote_client, split_path, config_true_value, generate_trans_id, \
affinity_key_function, affinity_locality_predicate, list_from_csv, \
register_swift_info, parse_prefixed_conf, config_auto_int_value
register_swift_info, parse_prefixed_conf, config_auto_int_value, \
config_request_node_count_value
from swift.common.constraints import check_utf8, valid_api_version
from swift.proxy.controllers import AccountController, ContainerController, \
ObjectControllerRouter, InfoController
@ -279,16 +280,8 @@ class Application(object):
conf.get('strict_cors_mode', 't'))
self.node_timings = {}
self.timing_expiry = int(conf.get('timing_expiry', 300))
value = conf.get('request_node_count', '2 * replicas').lower().split()
if len(value) == 1:
rnc_value = int(value[0])
self.request_node_count = lambda replicas: rnc_value
elif len(value) == 3 and value[1] == '*' and value[2] == 'replicas':
rnc_value = int(value[0])
self.request_node_count = lambda replicas: rnc_value * replicas
else:
raise ValueError(
'Invalid request_node_count value: %r' % ''.join(value))
value = conf.get('request_node_count', '2 * replicas')
self.request_node_count = config_request_node_count_value(value)
# swift_owner_headers are stripped by the account and container
# controllers; we should extend header stripping to object controller
# when a privileged object header is implemented.

View File

@ -33,7 +33,7 @@ from six.moves.http_client import HTTPConnection
from six.moves.urllib.parse import urlparse
from swiftclient import get_auth, head_account, client
from swift.common import internal_client, direct_client
from swift.common import internal_client, direct_client, utils
from swift.common.direct_client import DirectClientException
from swift.common.ring import Ring
from swift.common.utils import readconf, renamer, \
@ -41,6 +41,7 @@ from swift.common.utils import readconf, renamer, \
from swift.common.manager import Manager
from swift.common.storage_policy import POLICIES, EC_POLICY, REPL_POLICY
from swift.obj.diskfile import get_data_dir
from test.debug_logger import debug_logger
from test.probe import CHECK_SERVER_TIMEOUT, VALIDATE_RSYNC, PROXY_BASE_URL
@ -186,6 +187,8 @@ def store_config_paths(name, configs):
server_names = [name, '%s-replicator' % name]
if name == 'container':
server_names.append('container-sharder')
elif name == 'object':
server_names.append('object-reconstructor')
for server_name in server_names:
for server in Manager([server_name]):
for i, conf in enumerate(server.conf_files(), 1):
@ -563,6 +566,15 @@ class ProbeTest(unittest.TestCase):
for ent in os.listdir(ap_dir_fullpath)])
return async_pendings
def run_custom_daemon(self, klass, conf_section, conf_index,
custom_conf, **kwargs):
conf_file = self.configs[conf_section][conf_index]
conf = utils.readconf(conf_file, conf_section)
conf.update(custom_conf)
daemon = klass(conf, debug_logger('probe'))
daemon.run_once(**kwargs)
return daemon
class ReplProbeTest(ProbeTest):
@ -679,6 +691,7 @@ class ECProbeTest(ProbeTest):
self.direct_get(onode, opart, require_durable=require_durable)
except direct_client.DirectClientException as err:
self.assertEqual(err.http_status, status)
return err
else:
self.fail('Node data on %r was not fully destroyed!' % (onode,))

View File

@ -13,7 +13,7 @@
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
from contextlib import contextmanager
import unittest
import uuid
@ -22,7 +22,9 @@ import time
import six
from swift.common.direct_client import DirectClientException
from swift.common.manager import Manager
from swift.common.utils import md5
from swift.obj.reconstructor import ObjectReconstructor
from test.probe.common import ECProbeTest
from swift.common import direct_client
@ -369,6 +371,160 @@ class TestReconstructorRebuild(ECProbeTest):
'X-Backend-Storage-Policy-Index': int(self.policy)})
self.assertNotIn('X-Delete-At', headers)
def test_rebuild_quarantines_lonely_frag(self):
# fail one device while the object is deleted so we are left with one
# fragment and some tombstones
failed_node = self.onodes[0]
device_path = self.device_dir(failed_node)
self.kill_drive(device_path)
self.assert_direct_get_fails(failed_node, self.opart, 507) # sanity
# delete object
client.delete_object(self.url, self.token, self.container_name,
self.object_name)
# check we have tombstones
for node in self.onodes[1:]:
err = self.assert_direct_get_fails(node, self.opart, 404)
self.assertIn('X-Backend-Timestamp', err.http_headers)
# run the reconstructor with zero reclaim age to clean up tombstones
for conf_index in self.configs['object-reconstructor'].keys():
self.run_custom_daemon(
ObjectReconstructor, 'object-reconstructor', conf_index,
{'reclaim_age': '0'})
# check we no longer have tombstones
for node in self.onodes[1:]:
err = self.assert_direct_get_fails(node, self.opart, 404)
self.assertNotIn('X-Timestamp', err.http_headers)
# revive the failed device and check it has a fragment
self.revive_drive(device_path)
self.assert_direct_get_succeeds(failed_node, self.opart)
# restart proxy to clear error-limiting so that the revived drive
# participates again
Manager(['proxy-server']).restart()
# client GET will fail with 503 ...
with self.assertRaises(ClientException) as cm:
client.get_object(self.url, self.token, self.container_name,
self.object_name)
self.assertEqual(503, cm.exception.http_status)
# ... but client HEAD succeeds
headers = client.head_object(self.url, self.token, self.container_name,
self.object_name)
for key in self.headers_post:
self.assertIn(key, headers)
self.assertEqual(self.headers_post[key], headers[key])
# run the reconstructor without quarantine_threshold set
error_lines = []
warning_lines = []
for conf_index in self.configs['object-reconstructor'].keys():
reconstructor = self.run_custom_daemon(
ObjectReconstructor, 'object-reconstructor', conf_index,
{'reclaim_age': '0'})
logger = reconstructor.logger.logger
error_lines.append(logger.get_lines_for_level('error'))
warning_lines.append(logger.get_lines_for_level('warning'))
# check logs for errors
found_lines = False
for lines in error_lines:
if not lines:
continue
self.assertFalse(found_lines, error_lines)
found_lines = True
for line in itertools.islice(lines, 0, 6, 2):
self.assertIn(
'Unable to get enough responses (1/4 from 1 ok '
'responses)', line, lines)
for line in itertools.islice(lines, 1, 7, 2):
self.assertIn(
'Unable to get enough responses (4 x 404 error '
'responses)', line, lines)
self.assertTrue(found_lines, 'error lines not found')
for lines in warning_lines:
self.assertEqual([], lines)
# check we still have a single fragment and no tombstones
self.assert_direct_get_succeeds(failed_node, self.opart)
for node in self.onodes[1:]:
err = self.assert_direct_get_fails(node, self.opart, 404)
self.assertNotIn('X-Timestamp', err.http_headers)
# run the reconstructor to quarantine the lonely frag
error_lines = []
warning_lines = []
for conf_index in self.configs['object-reconstructor'].keys():
reconstructor = self.run_custom_daemon(
ObjectReconstructor, 'object-reconstructor', conf_index,
{'reclaim_age': '0', 'quarantine_threshold': '1'})
logger = reconstructor.logger.logger
error_lines.append(logger.get_lines_for_level('error'))
warning_lines.append(logger.get_lines_for_level('warning'))
# check logs for errors
found_lines = False
for index, lines in enumerate(error_lines):
if not lines:
continue
self.assertFalse(found_lines, error_lines)
found_lines = True
for line in itertools.islice(lines, 0, 6, 2):
self.assertIn(
'Unable to get enough responses (1/4 from 1 ok '
'responses)', line, lines)
for line in itertools.islice(lines, 1, 7, 2):
self.assertIn(
'Unable to get enough responses (6 x 404 error '
'responses)', line, lines)
self.assertTrue(found_lines, 'error lines not found')
# check logs for quarantine warning
found_lines = False
for lines in warning_lines:
if not lines:
continue
self.assertFalse(found_lines, warning_lines)
found_lines = True
self.assertEqual(1, len(lines), lines)
self.assertIn('Quarantined object', lines[0])
self.assertTrue(found_lines, 'warning lines not found')
# check we have nothing
for node in self.onodes:
err = self.assert_direct_get_fails(node, self.opart, 404)
self.assertNotIn('X-Backend-Timestamp', err.http_headers)
# client HEAD and GET now both 404
with self.assertRaises(ClientException) as cm:
client.get_object(self.url, self.token, self.container_name,
self.object_name)
self.assertEqual(404, cm.exception.http_status)
with self.assertRaises(ClientException) as cm:
client.head_object(self.url, self.token, self.container_name,
self.object_name)
self.assertEqual(404, cm.exception.http_status)
# run the reconstructor once more - should see no errors in logs!
error_lines = []
warning_lines = []
for conf_index in self.configs['object-reconstructor'].keys():
reconstructor = self.run_custom_daemon(
ObjectReconstructor, 'object-reconstructor', conf_index,
{'reclaim_age': '0', 'quarantine_threshold': '1'})
logger = reconstructor.logger.logger
error_lines.append(logger.get_lines_for_level('error'))
warning_lines.append(logger.get_lines_for_level('warning'))
for lines in error_lines:
self.assertEqual([], lines)
for lines in warning_lines:
self.assertEqual([], lines)
if six.PY2:
class TestReconstructorRebuildUTF8(TestReconstructorRebuild):

View File

@ -40,7 +40,6 @@ from test.probe import PROXY_BASE_URL
from test.probe.brain import BrainSplitter
from test.probe.common import ReplProbeTest, get_server_number, \
wait_for_server_to_hangup
from test.debug_logger import debug_logger
import mock
@ -415,12 +414,8 @@ class BaseTestContainerSharding(ReplProbeTest):
additional_args='--partitions=%s' % part)
def run_custom_sharder(self, conf_index, custom_conf, **kwargs):
conf_file = self.configs['container-sharder'][conf_index]
conf = utils.readconf(conf_file, 'container-sharder')
conf.update(custom_conf)
sharder = ContainerSharder(conf, logger=debug_logger('probe'))
sharder.run_once(**kwargs)
return sharder
return self.run_custom_daemon(ContainerSharder, 'container-sharder',
conf_index, custom_conf, **kwargs)
class TestContainerShardingNonUTF8(BaseTestContainerSharding):

View File

@ -18,6 +18,7 @@ from __future__ import print_function
import hashlib
from test import annotate_failure
from test.debug_logger import debug_logger
from test.unit import temptree, make_timestamp_iter, with_tempdir, \
mock_timestamp_now, FakeIterable
@ -3156,6 +3157,27 @@ cluster_dfw1 = http://dfw1.host/v1/
'less than 100, not "{}"'.format(val),
cm.exception.args[0])
def test_config_request_node_count_value(self):
def do_test(value, replicas, expected):
self.assertEqual(
expected,
utils.config_request_node_count_value(value)(replicas))
do_test('0', 10, 0)
do_test('1 * replicas', 3, 3)
do_test('1 * replicas', 11, 11)
do_test('2 * replicas', 3, 6)
do_test('2 * replicas', 11, 22)
do_test('11', 11, 11)
do_test('10', 11, 10)
do_test('12', 11, 12)
for bad in ('1.1', 1.1, 'auto', 'bad',
'2.5 * replicas', 'two * replicas'):
with annotate_failure(bad):
with self.assertRaises(ValueError):
utils.config_request_node_count_value(bad)
def test_config_auto_int_value(self):
expectations = {
# (value, default) : expected,

View File

@ -34,7 +34,7 @@ from gzip import GzipFile
from shutil import rmtree
from six.moves.urllib.parse import unquote
from swift.common import utils
from swift.common.exceptions import DiskFileError
from swift.common.exceptions import DiskFileError, DiskFileQuarantined
from swift.common.header_key_dict import HeaderKeyDict
from swift.common.utils import dump_recon_cache, md5
from swift.obj import diskfile, reconstructor as object_reconstructor
@ -42,6 +42,7 @@ from swift.common import ring
from swift.common.storage_policy import (StoragePolicy, ECStoragePolicy,
POLICIES, EC_POLICY)
from swift.obj.reconstructor import SYNC, REVERT
from test import annotate_failure
from test.debug_logger import debug_logger
from test.unit import (patch_policies, mocked_http_conn, FabricatedRing,
@ -785,10 +786,8 @@ class TestGlobalSetupObjectReconstructor(unittest.TestCase):
def do_test(stat_code):
with mocked_http_conn(stat_code):
resp = self.reconstructor._get_response(node, part,
path='nada',
headers={},
full_path='nada/nada')
resp = self.reconstructor._get_response(
node, self.policy, part, path='nada', headers={})
return resp
for status in (200, 400, 404, 503):
@ -4528,17 +4527,27 @@ class TestObjectReconstructor(BaseTestObjectReconstructor):
class TestReconstructFragmentArchive(BaseTestObjectReconstructor):
obj_path = b'/a/c/o' # subclass overrides this
obj_name = b'o' # subclass overrides this
def setUp(self):
super(TestReconstructFragmentArchive, self).setUp()
self.obj_path = b'/a/c/' + self.obj_name
self.obj_timestamp = self.ts()
self.obj_metadata = {
'name': self.obj_path,
'Content-Length': '0',
'ETag': 'etag',
'X-Timestamp': self.obj_timestamp.normal
}
def _create_fragment(self, frag_index, body=b'test data'):
utils.mkdirs(os.path.join(self.devices, 'sda1'))
df_mgr = self.reconstructor._df_router[self.policy]
if six.PY2:
obj_name = self.obj_name
else:
obj_name = self.obj_name.decode('utf8')
self.df = df_mgr.get_diskfile('sda1', 9, 'a', 'c', obj_name,
policy=self.policy)
write_diskfile(self.df, self.obj_timestamp, data=body,
frag_index=frag_index)
self.df.open()
self.logger.clear()
return self.df
def test_reconstruct_fa_no_errors(self):
job = {
@ -4565,9 +4574,9 @@ class TestReconstructFragmentArchive(BaseTestObjectReconstructor):
called_headers = []
orig_func = object_reconstructor.ObjectReconstructor._get_response
def _get_response_hook(self, node, part, path, headers, policy):
def _get_response_hook(self, node, policy, part, path, headers):
called_headers.append(headers)
return orig_func(self, node, part, path, headers, policy)
return orig_func(self, node, policy, part, path, headers)
codes, body_iter, headers = zip(*responses)
get_response_path = \
@ -4576,7 +4585,7 @@ class TestReconstructFragmentArchive(BaseTestObjectReconstructor):
with mocked_http_conn(
*codes, body_iter=body_iter, headers=headers):
df = self.reconstructor.reconstruct_fa(
job, node, self.obj_metadata)
job, node, self._create_fragment(2, body=b''))
self.assertEqual(0, df.content_length)
fixed_body = b''.join(df.reader())
self.assertEqual(len(fixed_body), len(broken_body))
@ -4635,7 +4644,7 @@ class TestReconstructFragmentArchive(BaseTestObjectReconstructor):
with mocked_http_conn(*codes, body_iter=body_iter,
headers=headers_iter):
df = self.reconstructor.reconstruct_fa(
job, node, dict(self.obj_metadata))
job, node, self._create_fragment(2))
fixed_body = b''.join(df.reader())
self.assertEqual(len(fixed_body), len(broken_body))
self.assertEqual(
@ -4676,7 +4685,7 @@ class TestReconstructFragmentArchive(BaseTestObjectReconstructor):
with mocked_http_conn(*codes, body_iter=body_iter,
headers=headers_iter):
df = self.reconstructor.reconstruct_fa(
job, node, dict(self.obj_metadata))
job, node, self._create_fragment(2))
fixed_body = b''.join(df.reader())
self.assertEqual(len(fixed_body), len(broken_body))
self.assertEqual(
@ -4722,7 +4731,7 @@ class TestReconstructFragmentArchive(BaseTestObjectReconstructor):
with mocked_http_conn(*codes, body_iter=body_iter,
headers=headers_iter):
df = self.reconstructor.reconstruct_fa(
job, node, dict(self.obj_metadata))
job, node, self._create_fragment(2))
fixed_body = b''.join(df.reader())
# ... this bad response should be ignored like any other failure
self.assertEqual(len(fixed_body), len(broken_body))
@ -4761,7 +4770,7 @@ class TestReconstructFragmentArchive(BaseTestObjectReconstructor):
with mocked_http_conn(*codes, body_iter=body_iter,
headers=headers_iter):
df = self.reconstructor.reconstruct_fa(
job, node, dict(self.obj_metadata))
job, node, self._create_fragment(2))
fixed_body = b''.join(df.reader())
self.assertEqual(len(fixed_body), len(broken_body))
self.assertEqual(
@ -4783,7 +4792,7 @@ class TestReconstructFragmentArchive(BaseTestObjectReconstructor):
range(policy.object_ring.replicas - 1)]
with mocked_http_conn(*codes):
self.assertRaises(DiskFileError, self.reconstructor.reconstruct_fa,
job, node, self.obj_metadata)
job, node, self._create_fragment(2))
error_lines = self.logger.get_lines_for_level('error')
# # of replicas failed and one more error log to report not enough
# responses to reconstruct.
@ -4799,6 +4808,7 @@ class TestReconstructFragmentArchive(BaseTestObjectReconstructor):
self.assertFalse(self.logger.get_lines_for_level('warning'))
def test_reconstruct_fa_all_404s_fails(self):
self._create_fragment(2)
job = {
'partition': 0,
'policy': self.policy,
@ -4811,7 +4821,7 @@ class TestReconstructFragmentArchive(BaseTestObjectReconstructor):
codes = [404 for i in range(policy.object_ring.replicas - 1)]
with mocked_http_conn(*codes):
self.assertRaises(DiskFileError, self.reconstructor.reconstruct_fa,
job, node, self.obj_metadata)
job, node, self.df)
error_lines = self.logger.get_lines_for_level('error')
# only 1 log to report not enough responses
self.assertEqual(1, len(error_lines))
@ -4823,7 +4833,51 @@ class TestReconstructFragmentArchive(BaseTestObjectReconstructor):
# no warning
self.assertFalse(self.logger.get_lines_for_level('warning'))
def test_reconstruct_fa_all_404s_fails_custom_request_node_count(self):
# verify that when quarantine_threshold is not set the number of
# requests is capped at replicas - 1 regardless of request_node_count
self._create_fragment(2)
job = {
'partition': 0,
'policy': self.policy,
}
part_nodes = self.policy.object_ring.get_part_nodes(0)
node = part_nodes[1]
node['backend_index'] = self.policy.get_backend_index(node['index'])
ring = self.policy.object_ring
# sanity check: number of handoffs available == replicas
self.assertEqual(ring.max_more_nodes, ring.replicas)
for request_node_count in (0,
self.policy.ec_ndata - 1,
ring.replicas + 1,
2 * ring.replicas - 1,
2 * ring.replicas,
3 * ring.replicas,
99 * ring.replicas):
with annotate_failure(request_node_count):
self.logger.clear()
self.reconstructor.request_node_count = \
lambda replicas: request_node_count
# request count capped at num primaries - 1
exp_requests = ring.replicas - 1
codes = [404 for i in range(exp_requests)]
with mocked_http_conn(*codes):
self.assertRaises(DiskFileError,
self.reconstructor.reconstruct_fa,
job, node, self.df)
error_lines = self.logger.get_lines_for_level('error')
# only 1 log to report not enough responses
self.assertEqual(1, len(error_lines))
self.assertIn(
'Unable to get enough responses (%s x 404 error responses)'
% exp_requests,
error_lines[0],
"Unexpected error line found: %s" % error_lines[0])
# no warning
self.assertFalse(self.logger.get_lines_for_level('warning'))
def test_reconstruct_fa_mixture_of_errors_fails(self):
self._create_fragment(2)
job = {
'partition': 0,
'policy': self.policy,
@ -4839,7 +4893,7 @@ class TestReconstructFragmentArchive(BaseTestObjectReconstructor):
range(policy.object_ring.replicas - 4)]
with mocked_http_conn(*codes):
self.assertRaises(DiskFileError, self.reconstructor.reconstruct_fa,
job, node, self.obj_metadata)
job, node, self.df)
exp_timeouts = len([c for c in codes if isinstance(c, Timeout)])
exp_404s = len([c for c in codes if c == 404])
exp_507s = len([c for c in codes if c == 507])
@ -4901,7 +4955,7 @@ class TestReconstructFragmentArchive(BaseTestObjectReconstructor):
codes, body_iter, headers = zip(*responses)
with mocked_http_conn(*codes, body_iter=body_iter, headers=headers):
df = self.reconstructor.reconstruct_fa(
job, node, self.obj_metadata)
job, node, self._create_fragment(2))
fixed_body = b''.join(df.reader())
self.assertEqual(len(fixed_body), len(broken_body))
self.assertEqual(
@ -4941,7 +4995,7 @@ class TestReconstructFragmentArchive(BaseTestObjectReconstructor):
codes, body_iter, headers = zip(*responses)
with mocked_http_conn(*codes, body_iter=body_iter, headers=headers):
df = self.reconstructor.reconstruct_fa(
job, node, dict(self.obj_metadata))
job, node, self._create_fragment(2))
fixed_body = b''.join(df.reader())
self.assertEqual(len(fixed_body), len(broken_body))
self.assertEqual(
@ -4958,7 +5012,7 @@ class TestReconstructFragmentArchive(BaseTestObjectReconstructor):
codes, body_iter, headers = zip(*responses)
with mocked_http_conn(*codes, body_iter=body_iter, headers=headers):
df = self.reconstructor.reconstruct_fa(
job, node, dict(self.obj_metadata))
job, node, self._create_fragment(2))
fixed_body = b''.join(df.reader())
self.assertEqual(len(fixed_body), len(broken_body))
self.assertEqual(
@ -4995,7 +5049,7 @@ class TestReconstructFragmentArchive(BaseTestObjectReconstructor):
codes, body_iter, headers = zip(*responses)
with mocked_http_conn(*codes, body_iter=body_iter, headers=headers):
df = self.reconstructor.reconstruct_fa(
job, node, dict(self.obj_metadata))
job, node, self._create_fragment(2))
fixed_body = b''.join(df.reader())
self.assertEqual(len(fixed_body), len(broken_body))
self.assertEqual(
@ -5016,7 +5070,7 @@ class TestReconstructFragmentArchive(BaseTestObjectReconstructor):
codes, body_iter, headers = zip(*responses)
with mocked_http_conn(*codes, body_iter=body_iter, headers=headers):
df = self.reconstructor.reconstruct_fa(
job, node, dict(self.obj_metadata))
job, node, self._create_fragment(2))
fixed_body = b''.join(df.reader())
self.assertEqual(len(fixed_body), len(broken_body))
self.assertEqual(
@ -5080,7 +5134,7 @@ class TestReconstructFragmentArchive(BaseTestObjectReconstructor):
codes, body_iter, headers = zip(*responses)
with mocked_http_conn(*codes, body_iter=body_iter, headers=headers):
self.assertRaises(DiskFileError, self.reconstructor.reconstruct_fa,
job, node, self.obj_metadata)
job, node, self._create_fragment(2))
error_lines = self.logger.get_lines_for_level('error')
# 1 error log per etag to report not enough responses
@ -5111,6 +5165,7 @@ class TestReconstructFragmentArchive(BaseTestObjectReconstructor):
self.assertFalse(self.logger.get_lines_for_level('warning'))
def test_reconstruct_fa_with_mixed_etags_same_timestamp_fail(self):
self._create_fragment(2)
job = {
'partition': 0,
'policy': self.policy,
@ -5157,7 +5212,7 @@ class TestReconstructFragmentArchive(BaseTestObjectReconstructor):
codes, body_iter, headers = zip(*responses)
with mocked_http_conn(*codes, body_iter=body_iter, headers=headers):
self.assertRaises(DiskFileError, self.reconstructor.reconstruct_fa,
job, node, self.obj_metadata)
job, node, self.df)
error_lines = self.logger.get_lines_for_level('error')
self.assertGreater(len(error_lines), 1)
@ -5219,7 +5274,7 @@ class TestReconstructFragmentArchive(BaseTestObjectReconstructor):
codes, body_iter, headers = zip(*responses)
with mocked_http_conn(*codes, body_iter=body_iter, headers=headers):
df = self.reconstructor.reconstruct_fa(
job, node, self.obj_metadata)
job, node, self._create_fragment(2))
fixed_body = b''.join(df.reader())
self.assertEqual(len(fixed_body), len(broken_body))
self.assertEqual(
@ -5234,7 +5289,7 @@ class TestReconstructFragmentArchive(BaseTestObjectReconstructor):
# redundant frag found once in first ec_ndata responses
self.assertIn(
'Found existing frag #%s at' % broken_index,
debug_log_lines[0])
debug_log_lines[0], debug_log_lines)
# N.B. in the future, we could avoid those check because
# definitely sending the copy rather than reconstruct will
@ -5251,6 +5306,537 @@ class TestReconstructFragmentArchive(BaseTestObjectReconstructor):
debug_log_lines[1][len(log_prefix):])
self.assertNotIn(broken_index, got_frag_index_list)
def test_quarantine_threshold_conf(self):
reconstructor = object_reconstructor.ObjectReconstructor({})
self.assertEqual(0, reconstructor.quarantine_threshold)
reconstructor = object_reconstructor.ObjectReconstructor(
{'quarantine_threshold': '0'})
self.assertEqual(0, reconstructor.quarantine_threshold)
reconstructor = object_reconstructor.ObjectReconstructor(
{'quarantine_threshold': '1'})
self.assertEqual(1, reconstructor.quarantine_threshold)
reconstructor = object_reconstructor.ObjectReconstructor(
{'quarantine_threshold': 2.0})
self.assertEqual(2, reconstructor.quarantine_threshold)
for bad in ('1.1', 1.1, '-1', -1, 'auto', 'bad'):
with annotate_failure(bad):
with self.assertRaises(ValueError):
object_reconstructor.ObjectReconstructor(
{'quarantine_threshold': bad})
def test_request_node_count_conf(self):
# default is 1 * replicas
reconstructor = object_reconstructor.ObjectReconstructor({})
self.assertEqual(6, reconstructor.request_node_count(3))
self.assertEqual(22, reconstructor.request_node_count(11))
def do_test(value, replicas, expected):
reconstructor = object_reconstructor.ObjectReconstructor(
{'request_node_count': value})
self.assertEqual(expected,
reconstructor.request_node_count(replicas))
do_test('0', 10, 0)
do_test('1 * replicas', 3, 3)
do_test('1 * replicas', 11, 11)
do_test('2 * replicas', 3, 6)
do_test('2 * replicas', 11, 22)
do_test('11', 11, 11)
do_test('10', 11, 10)
do_test('12', 11, 12)
for bad in ('1.1', 1.1, 'auto', 'bad',
'2.5 * replicas', 'two * replicas'):
with annotate_failure(bad):
with self.assertRaises(ValueError):
object_reconstructor.ObjectReconstructor(
{'request_node_count': bad})
def _do_test_reconstruct_insufficient_frags(
self, extra_conf, num_frags, other_responses,
local_frag_index=2, frag_index_to_rebuild=1,
resp_timestamps=None, resp_etags=None):
# num_frags is number of ok responses, other_responses is bad responses
# By default frag_index_to_rebuild is less than local_frag_index and
# all frag responses have indexes >= local_frag_index
self.assertGreater(num_frags, 0)
self.logger.clear()
self._configure_reconstructor(**extra_conf)
self._create_fragment(local_frag_index)
job = {
'partition': 0,
'policy': self.policy,
}
part_nodes = self.policy.object_ring.get_part_nodes(0)
node = part_nodes[frag_index_to_rebuild]
node['backend_index'] = self.policy.get_backend_index(node['index'])
test_data = (b'rebuild' * self.policy.ec_segment_size)[:-777]
etag = md5(test_data, usedforsecurity=False).hexdigest()
ec_archive_bodies = encode_frag_archive_bodies(self.policy, test_data)
frags = ec_archive_bodies[
local_frag_index:local_frag_index + num_frags]
if resp_etags:
self.assertEqual(len(frags), len(resp_etags))
etags = []
for other_etag in resp_etags:
# use default etag where other_etag is None
etags.append(other_etag if other_etag else etag)
else:
etags = [etag] * len(frags)
def make_header(body):
headers = get_header_frag_index(self, body)
headers.update({'X-Object-Sysmeta-Ec-Etag': etags.pop(0)})
return headers
responses = [(200, frag, make_header(frag)) for frag in frags]
codes, body_iter, headers = zip(*(responses + other_responses))
resp_timestamps = (resp_timestamps if resp_timestamps
else [self.obj_timestamp] * len(codes))
resp_timestamps = [ts.internal for ts in resp_timestamps]
with mocked_http_conn(*codes, body_iter=body_iter,
headers=headers,
timestamps=resp_timestamps):
with self.assertRaises(DiskFileError) as cm:
self.reconstructor.reconstruct_fa(
job, node, self._create_fragment(2))
return cm.exception
def _verify_error_lines(self, num_frags, other_responses,
exp_useful_responses):
error_lines = self.logger.get_lines_for_level('error')
self.assertEqual(2, len(error_lines), error_lines)
self.assertIn(
'Unable to get enough responses (%d/%d from %d ok responses)'
% (exp_useful_responses, self.policy.ec_ndata, num_frags),
error_lines[0])
bad_codes = collections.Counter(
status for status, _, _ in other_responses)
errors = ', '.join('%s x %s' % (num, code)
for code, num in sorted(bad_codes.items()))
self.assertIn('Unable to get enough responses (%s error responses)'
% errors, error_lines[1])
def _assert_diskfile_quarantined(self):
warning_lines = self.logger.get_lines_for_level('warning')
self.assertEqual(1, len(warning_lines), warning_lines)
self.assertIn('Quarantined object', warning_lines[0])
# Check the diskfile has moved to quarantine dir
data_filename = os.path.basename(self.df._data_file)
df_hash = os.path.basename(self.df._datadir)
quarantine_dir = os.path.join(
self.df._device_path, 'quarantined',
diskfile.get_data_dir(self.policy), df_hash)
self.assertTrue(os.path.isdir(quarantine_dir))
quarantine_file = os.path.join(quarantine_dir, data_filename)
self.assertTrue(os.path.isfile(quarantine_file))
with open(quarantine_file, 'r') as fd:
self.assertEqual('test data', fd.read())
self.assertFalse(os.path.exists(self.df._data_file))
def _assert_diskfile_not_quarantined(self):
# Check the diskfile has not moved to quarantine dir
quarantine_dir = os.path.join(
self.df._device_path, 'quarantined')
self.assertFalse(os.path.isdir(quarantine_dir))
self.assertTrue(os.path.exists(self.df._data_file))
with open(self.df._data_file, 'r') as fd:
self.assertEqual('test data', fd.read())
def test_reconstruct_fa_quarantine_threshold_one_rnc_two_replicas(self):
# use default request_node_count == 2 * replicas
num_other_resps = 2 * self.policy.object_ring.replicas - 2
other_responses = [(404, None, None)] * num_other_resps
conf = {'quarantine_threshold': 1, 'reclaim_age': 0}
exc = self._do_test_reconstruct_insufficient_frags(
conf, 1, other_responses)
self.assertIsInstance(exc, DiskFileQuarantined)
self._assert_diskfile_quarantined()
self._verify_error_lines(1, other_responses, 1)
def test_reconstruct_fa_quarantine_threshold_one_rnc_three_replicas(self):
num_other_resps = 3 * self.policy.object_ring.replicas - 2
other_responses = [(404, None, None)] * num_other_resps
conf = {'quarantine_threshold': 1, 'reclaim_age': 0,
'request_node_count': '3 * replicas'}
# set ring get_more_nodes to yield enough handoffs
self.policy.object_ring.max_more_nodes = (
2 * self.policy.object_ring.replicas)
exc = self._do_test_reconstruct_insufficient_frags(
conf, 1, other_responses)
self.assertIsInstance(exc, DiskFileQuarantined)
self._assert_diskfile_quarantined()
self._verify_error_lines(1, other_responses, 1)
def test_reconstruct_fa_quarantine_threshold_one_rnc_four_replicas(self):
# verify handoff search exhausting handoff node iter
num_other_resps = 3 * self.policy.object_ring.replicas - 2
other_responses = [(404, None, None)] * num_other_resps
conf = {'quarantine_threshold': 1, 'reclaim_age': 0,
'request_node_count': '4 * replicas'}
# limit ring get_more_nodes to yield less than
# (request_node_count - 1 * replicas) nodes
self.policy.object_ring.max_more_nodes = (
2 * self.policy.object_ring.replicas)
exc = self._do_test_reconstruct_insufficient_frags(
conf, 1, other_responses)
self.assertIsInstance(exc, DiskFileQuarantined)
self._assert_diskfile_quarantined()
self._verify_error_lines(1, other_responses, 1)
def test_reconstruct_fa_quarantine_threshold_one_rnc_absolute_number(self):
def do_test(rnc_num):
if rnc_num < self.policy.object_ring.replicas:
num_other_resps = self.policy.object_ring.replicas - 2
else:
num_other_resps = rnc_num - 2
other_responses = [(404, None, None)] * num_other_resps
conf = {'quarantine_threshold': 1, 'reclaim_age': 0,
'request_node_count': str(rnc_num)}
# set ring get_more_nodes to yield enough handoffs
self.policy.object_ring.max_more_nodes = (
2 * self.policy.object_ring.replicas)
exc = self._do_test_reconstruct_insufficient_frags(
conf, 1, other_responses)
self.assertIsInstance(exc, DiskFileQuarantined)
self._assert_diskfile_quarantined()
self._verify_error_lines(1, other_responses, 1)
for rnc_num in range(0, 3 * self.policy.object_ring.replicas):
do_test(rnc_num)
def test_reconstruct_fa_quarantine_threshold_two(self):
num_other_resps = 2 * self.policy.object_ring.replicas - 3
other_responses = [(404, None, None)] * num_other_resps
conf = {'quarantine_threshold': 2, 'reclaim_age': 0}
exc = self._do_test_reconstruct_insufficient_frags(
conf, 2, other_responses)
self.assertIsInstance(exc, DiskFileQuarantined)
self._assert_diskfile_quarantined()
self._verify_error_lines(2, other_responses, 2)
def test_reconstruct_fa_no_quarantine_more_than_threshold_frags(self):
# default config
num_other_resps = self.policy.object_ring.replicas - 2
other_responses = [(404, None, None)] * num_other_resps
exc = self._do_test_reconstruct_insufficient_frags(
{'reclaim_age': 0}, 1, other_responses)
self.assertIsInstance(exc, DiskFileError)
self._assert_diskfile_not_quarantined()
# configured quarantine_threshold
for quarantine_threshold in range(self.policy.ec_ndata):
for num_frags in range(quarantine_threshold + 1,
self.policy.ec_ndata):
num_other_resps = (self.policy.object_ring.replicas -
num_frags - 1)
other_responses = [(404, None, None)] * num_other_resps
exc = self._do_test_reconstruct_insufficient_frags(
{'quarantine_threshold': quarantine_threshold,
'reclaim_age': 0},
num_frags, other_responses)
self.assertIsInstance(exc, DiskFileError)
self._assert_diskfile_not_quarantined()
self._verify_error_lines(num_frags, other_responses, num_frags)
warning_lines = self.logger.get_lines_for_level('warning')
self.assertEqual([], warning_lines)
# responses include the frag_index_to_rebuild - verify that response is
# counted against the threshold
num_other_resps = self.policy.object_ring.replicas - 3
other_responses = [(404, None, None)] * num_other_resps
exc = self._do_test_reconstruct_insufficient_frags(
{'quarantine_threshold': 1, 'reclaim_age': 0}, 2, other_responses,
local_frag_index=2, frag_index_to_rebuild=3)
self.assertIsInstance(exc, DiskFileError)
self._assert_diskfile_not_quarantined()
self._verify_error_lines(2, other_responses, 1)
def test_reconstruct_fa_no_quarantine_non_404_response(self):
num_frags = 1
ring = self.policy.object_ring
for bad_status in (400, 503, 507):
# a non-404 in primary responses will prevent quarantine
num_other_resps = ring.replicas - num_frags - 1
other_responses = [(404, None, None)] * (num_other_resps - 1)
other_responses.append((bad_status, None, None))
exc = self._do_test_reconstruct_insufficient_frags(
{'quarantine_threshold': 1, 'reclaim_age': 0},
num_frags, other_responses)
self.assertIsInstance(exc, DiskFileError)
self._assert_diskfile_not_quarantined()
self._verify_error_lines(num_frags, other_responses, num_frags)
warning_lines = self.logger.get_lines_for_level('warning')
self.assertEqual(1, len(warning_lines), warning_lines)
self.assertIn('Invalid response %s' % bad_status, warning_lines[0])
# a non-404 in handoff responses will prevent quarantine; non-404
# is the *final* handoff response...
ring.max_more_nodes = (13 * ring.replicas)
for request_node_count in (2, 3, 13):
num_other_resps = (request_node_count * ring.replicas
- num_frags - 1)
other_responses = [(404, None, None)] * (num_other_resps - 1)
other_responses.append((bad_status, None, None))
with annotate_failure(
'request_node_count=%d' % request_node_count):
exc = self._do_test_reconstruct_insufficient_frags(
{'quarantine_threshold': 1,
'reclaim_age': 0,
'request_node_count': '%s * replicas'
% request_node_count},
num_frags, other_responses)
self.assertIsInstance(exc, DiskFileError)
self._assert_diskfile_not_quarantined()
self._verify_error_lines(num_frags, other_responses, num_frags)
warning_lines = self.logger.get_lines_for_level('warning')
self.assertEqual(1, len(warning_lines), warning_lines)
self.assertIn('Invalid response %s' % bad_status,
warning_lines[0])
# a non-404 in handoff responses will prevent quarantine; non-404
# is part way through all handoffs so not all handoffs are used
# regardless of how big request_node_count is
non_404_handoff = 3
for request_node_count in (2, 3, 13):
# replicas - 1 - num_frags other_responses from primaries,
# plus a batch of replicas - 1 during which non-404 shows up,
# plus some that trickle out before the non-404 shows up, but
# limited to (request_node_count * replicas - num_frags - 1)
# e.g. for 10+4 policy with request_node_count > 2
# - batch of 13 requests go to primaries,
# - 12 other_responses are consumed,
# - then a batch of 13 handoff requests is sent,
# - the non-404 is the 4th response in that batch,
# - so 3 more requests will have been trickled out
batch_size = ring.replicas - 1
num_other_resps = min(
2 * batch_size - num_frags + non_404_handoff,
request_node_count * ring.replicas - 1 - num_frags)
other_responses = [(404, None, None)] * (num_other_resps - 1)
other_responses.insert(
batch_size - num_frags + non_404_handoff,
(bad_status, None, None))
exc = self._do_test_reconstruct_insufficient_frags(
{'quarantine_threshold': 1, 'reclaim_age': 0,
'request_node_count': '%s * replicas'
% request_node_count},
num_frags, other_responses)
self.assertIsInstance(exc, DiskFileError)
self._assert_diskfile_not_quarantined()
self._verify_error_lines(num_frags, other_responses, num_frags)
warning_lines = self.logger.get_lines_for_level('warning')
self.assertEqual(1, len(warning_lines), warning_lines)
self.assertIn('Invalid response %s' % bad_status,
warning_lines[0])
def test_reconstruct_fa_no_quarantine_frag_not_old_enough(self):
# verify that solitary fragment is not quarantined if it has not
# reached reclaim_age
num_other_resps = self.policy.object_ring.replicas - 2
other_responses = [(404, None, None)] * num_other_resps
exc = self._do_test_reconstruct_insufficient_frags(
{'quarantine_threshold': 1, 'reclaim_age': 10000},
1, other_responses)
self.assertIsInstance(exc, DiskFileError)
self._assert_diskfile_not_quarantined()
self._verify_error_lines(1, other_responses, 1)
exc = self._do_test_reconstruct_insufficient_frags(
{'quarantine_threshold': 1}, # default reclaim_age
1, other_responses)
self.assertIsInstance(exc, DiskFileError)
self._assert_diskfile_not_quarantined()
self._verify_error_lines(1, other_responses, 1)
def test_reconstruct_fa_no_quarantine_frag_resp_different_timestamp(self):
# verify that solitary fragment is not quarantined if the only frag
# response is for a different timestamp than the local frag
resp_timestamp = utils.Timestamp(float(self.obj_timestamp) + 1)
num_other_resps = self.policy.object_ring.replicas - 2
other_responses = [(404, None, None)] * num_other_resps
resp_timestamps = [resp_timestamp] * (num_other_resps + 1)
exc = self._do_test_reconstruct_insufficient_frags(
{'quarantine_threshold': 1, 'reclaim_age': 0},
1, other_responses, resp_timestamps=resp_timestamps)
self.assertIsInstance(exc, DiskFileError)
self._assert_diskfile_not_quarantined()
self._verify_error_lines(1, other_responses, 1)
def test_reconstruct_fa_no_quarantine_frag_resp_mixed_timestamps(self):
# verify that solitary fragment is not quarantined if there is a
# response for a frag at different timestamp in addition to the
# response for the solitary local frag
resp_timestamp = utils.Timestamp(float(self.obj_timestamp) + 1)
num_other_resps = self.policy.object_ring.replicas - 3
other_responses = [(404, None, None)] * num_other_resps
resp_timestamps = ([self.obj_timestamp] +
[resp_timestamp] * (num_other_resps + 1))
exc = self._do_test_reconstruct_insufficient_frags(
{'quarantine_threshold': 1, 'reclaim_age': 0},
2, other_responses, resp_timestamps=resp_timestamps)
self.assertIsInstance(exc, DiskFileError)
self._assert_diskfile_not_quarantined()
error_lines = self.logger.get_lines_for_level('error')
self.assertEqual(3, len(error_lines), error_lines)
self.assertIn(
'Unable to get enough responses (1/%d from 1 ok responses)'
% (self.policy.ec_ndata,), error_lines[0])
self.assertIn(
'Unable to get enough responses (1/%d from 1 ok responses)'
% (self.policy.ec_ndata,), error_lines[1])
self.assertIn(
'Unable to get enough responses (%d x 404 error responses)'
% num_other_resps, error_lines[2])
def test_reconstruct_fa_no_quarantine_frag_resp_mixed_etags(self):
# verify that solitary fragment is not quarantined if there is a
# response for a frag with different etag in addition to the
# response for the solitary local frag
etags = [None, 'unexpected_etag']
num_other_resps = self.policy.object_ring.replicas - 3
other_responses = [(404, None, None)] * num_other_resps
exc = self._do_test_reconstruct_insufficient_frags(
{'quarantine_threshold': 1, 'reclaim_age': 0},
2, other_responses, resp_etags=etags)
self.assertIsInstance(exc, DiskFileError)
self._assert_diskfile_not_quarantined()
error_lines = self.logger.get_lines_for_level('error')
self.assertEqual(3, len(error_lines), error_lines)
self.assertIn(
'Mixed Etag', error_lines[0])
self.assertIn(
'Unable to get enough responses (1/%d from 2 ok responses)'
% (self.policy.ec_ndata,), error_lines[1])
self.assertIn(
'Unable to get enough responses (%d x 404 error responses)'
% num_other_resps, error_lines[2])
def _do_test_reconstruct_fa_no_quarantine_bad_headers(self, bad_headers):
# verify that responses with invalid headers count against the
# quarantine_threshold
self._configure_reconstructor(reclaim_age=0, quarantine_threshold=1)
local_frag_index = 2
self._create_fragment(local_frag_index)
job = {
'partition': 0,
'policy': self.policy,
}
part_nodes = self.policy.object_ring.get_part_nodes(0)
node = part_nodes[0]
node['backend_index'] = self.policy.get_backend_index(node['index'])
test_data = (b'rebuild' * self.policy.ec_segment_size)[:-777]
etag = md5(test_data, usedforsecurity=False).hexdigest()
ec_archive_bodies = encode_frag_archive_bodies(self.policy, test_data)
def make_header(body):
headers = get_header_frag_index(self, body)
headers.update({'X-Object-Sysmeta-Ec-Etag': etag})
return headers
responses = []
body = ec_archive_bodies[2]
headers = make_header(body)
responses.append((200, body, headers))
body = ec_archive_bodies[3]
headers = make_header(body)
headers.update(bad_headers)
responses.append((200, body, headers))
other_responses = ([(404, None, None)] *
(self.policy.object_ring.replicas - 3))
codes, body_iter, headers = zip(*(responses + other_responses))
resp_timestamps = [self.obj_timestamp] * len(codes)
resp_timestamps = [ts.internal for ts in resp_timestamps]
with mocked_http_conn(*codes, body_iter=body_iter,
headers=headers,
timestamps=resp_timestamps):
with self.assertRaises(DiskFileError) as cm:
self.reconstructor.reconstruct_fa(
job, node, self._create_fragment(2))
self.assertIsInstance(cm.exception, DiskFileError)
self._assert_diskfile_not_quarantined()
error_lines = self.logger.get_lines_for_level('error')
self.assertEqual(2, len(error_lines), error_lines)
self.assertIn(
'Unable to get enough responses (1/%d from 1 ok responses)'
% (self.policy.ec_ndata,), error_lines[0])
self.assertIn(
'Unable to get enough responses '
'(1 x unknown, %d x 404 error responses)'
% len(other_responses), error_lines[1])
def test_reconstruct_fa_no_quarantine_invalid_frag_index_header(self):
self._do_test_reconstruct_fa_no_quarantine_bad_headers(
{'X-Object-Sysmeta-Ec-Frag-Index': 'two'})
def test_reconstruct_fa_no_quarantine_missing_frag_index_header(self):
self._do_test_reconstruct_fa_no_quarantine_bad_headers(
{'X-Object-Sysmeta-Ec-Frag-Index': ''})
def test_reconstruct_fa_no_quarantine_missing_timestamp_header(self):
self._do_test_reconstruct_fa_no_quarantine_bad_headers(
{'X-Backend-Data-Timestamp': ''})
def test_reconstruct_fa_no_quarantine_missing_etag_header(self):
self._do_test_reconstruct_fa_no_quarantine_bad_headers(
{'X-Object-Sysmeta-Ec-Etag': ''})
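# The four variants above cover the response headers the rebuild path
# relies on: X-Object-Sysmeta-Ec-Frag-Index, X-Backend-Data-Timestamp and
# X-Object-Sysmeta-Ec-Etag. A response that is missing one of them, or
# carries an unparseable value, still counts against the
# quarantine_threshold, so the local fragment is left in place.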
def test_reconstruct_fa_frags_on_handoffs(self):
# just a lonely old frag on primaries: this appears to be a quarantine
# candidate, but unexpectedly the other frags are found on handoffs so
# expect rebuild
# set reclaim_age to 0 to make the lonely frag old enough for quarantine
self._configure_reconstructor(quarantine_threshold=1, reclaim_age=0)
job = {
'partition': 0,
'policy': self.policy,
}
part_nodes = self.policy.object_ring.get_part_nodes(0)
node = part_nodes[1]
node['backend_index'] = self.policy.get_backend_index(node['index'])
test_data = (b'rebuild' * self.policy.ec_segment_size)[:-777]
etag = md5(test_data, usedforsecurity=False).hexdigest()
ec_archive_bodies = encode_frag_archive_bodies(self.policy, test_data)
broken_body = ec_archive_bodies.pop(1)
# arrange for just one 200 to come from a primary, then 404s, then 200s
# from handoffs
responses = list()
for i, body in enumerate(ec_archive_bodies):
if i == 1:
# skip: this is the frag index we're rebuilding; insert 404s
responses.extend(
((404, None, None),) * self.policy.object_ring.replicas)
headers = get_header_frag_index(self, body)
headers.update({'X-Object-Sysmeta-Ec-Etag': etag})
responses.append((200, body, headers))
codes, body_iter, headers = zip(*responses)
with mocked_http_conn(
*codes, body_iter=body_iter, headers=headers,
timestamps=[self.obj_timestamp.internal] * len(codes)):
df = self.reconstructor.reconstruct_fa(
job, node, self._create_fragment(0, body=b''))
self.assertEqual(0, df.content_length)
fixed_body = b''.join(df.reader())
self.assertEqual(len(fixed_body), len(broken_body))
self.assertEqual(md5(fixed_body, usedforsecurity=False).hexdigest(),
md5(broken_body, usedforsecurity=False).hexdigest())
# no errors or warnings
self.assertFalse(self.logger.get_lines_for_level('error'))
self.assertFalse(self.logger.get_lines_for_level('warning'))
debug_lines = self.logger.get_lines_for_level('debug')
self.assertIn('Reconstructing frag from handoffs, node_count=%d'
% (self.policy.object_ring.replicas * 2), debug_lines)
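# Note: the debug line checked above shows that, before treating the
# lonely frag as a quarantine candidate, the reconstructor widened its
# search beyond the primaries to handoff nodes (a node_count of twice the
# ring's replica count in this test), and the frags found there allowed a
# normal rebuild instead of a quarantine.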
def test_reconstruct_fa_finds_duplicate_does_not_fail(self):
job = {
'partition': 0,
@ -5280,7 +5866,7 @@ class TestReconstructFragmentArchive(BaseTestObjectReconstructor):
codes, body_iter, headers = zip(*responses)
with mocked_http_conn(*codes, body_iter=body_iter, headers=headers):
df = self.reconstructor.reconstruct_fa(
job, node, self.obj_metadata)
job, node, self._create_fragment(2))
fixed_body = b''.join(df.reader())
self.assertEqual(len(fixed_body), len(broken_body))
self.assertEqual(
@ -5339,7 +5925,7 @@ class TestReconstructFragmentArchive(BaseTestObjectReconstructor):
with mocked_http_conn(
*codes, body_iter=body_iter, headers=headers) as mock_conn:
df = self.reconstructor.reconstruct_fa(
job, node, self.obj_metadata)
job, node, self._create_fragment(2))
fixed_body = b''.join(df.reader())
self.assertEqual(len(fixed_body), len(broken_body))
self.assertEqual(
@ -5408,7 +5994,7 @@ class TestReconstructFragmentArchive(BaseTestObjectReconstructor):
with mocked_http_conn(
*codes, body_iter=body_iter, headers=headers) as mock_conn:
df = self.reconstructor.reconstruct_fa(
job, node, self.obj_metadata)
job, node, self._create_fragment(2))
fixed_body = b''.join(df.reader())
self.assertEqual(len(fixed_body), len(broken_body))
self.assertEqual(
@ -5439,7 +6025,60 @@ class TestReconstructFragmentArchive(BaseTestObjectReconstructor):
@patch_policies(with_ec_default=True)
class TestReconstructFragmentArchiveUTF8(TestReconstructFragmentArchive):
# repeat superclass tests with an object path that contains non-ascii chars
obj_path = b'/a/c/o\xc3\xa8'
obj_name = b'o\xc3\xa8'
@patch_policies([ECStoragePolicy(0, name='ec', is_default=True,
ec_type=DEFAULT_TEST_EC_TYPE,
ec_ndata=10, ec_nparity=4,
ec_segment_size=4096,
ec_duplication_factor=2),
StoragePolicy(1, name='other')],
fake_ring_args=[{'replicas': 28}, {'replicas': 3}])
class TestReconstructFragmentArchiveECDuplicationFactor(
TestReconstructFragmentArchive):
def test_reconstruct_fa_no_quarantine_duplicate_frags(self):
# verify that quarantine does not happen if the only other response in
# addition to the lonely frag's own response is for the same
# (duplicate) frag index
self._configure_reconstructor(quarantine_threshold=1, reclaim_age=0)
local_frag_index = 2
self._create_fragment(local_frag_index)
job = {
'partition': 0,
'policy': self.policy,
}
part_nodes = self.policy.object_ring.get_part_nodes(0)
node = part_nodes[0]
node['backend_index'] = self.policy.get_backend_index(node['index'])
test_data = (b'rebuild' * self.policy.ec_segment_size)[:-777]
etag = md5(test_data, usedforsecurity=False).hexdigest()
ec_archive_bodies = encode_frag_archive_bodies(self.policy, test_data)
frags = [
ec_archive_bodies[local_frag_index],
ec_archive_bodies[local_frag_index +
self.policy.ec_n_unique_fragments]]
def make_header(body):
headers = get_header_frag_index(self, body)
headers.update({'X-Object-Sysmeta-Ec-Etag': etag})
return headers
responses = [(200, frag, make_header(frag)) for frag in frags]
other_responses = ([(404, None, None)] *
(self.policy.ec_n_unique_fragments * 2 - 3))
codes, body_iter, headers = zip(*(responses + other_responses))
resp_timestamps = [self.obj_timestamp.internal] * len(codes)
with mocked_http_conn(*codes, body_iter=body_iter,
headers=headers,
timestamps=resp_timestamps):
with self.assertRaises(DiskFileError) as cm:
self.reconstructor.reconstruct_fa(
job, node, self._create_fragment(2))
self.assertIsInstance(cm.exception, DiskFileError)
self._assert_diskfile_not_quarantined()
self._verify_error_lines(2, other_responses, 1)
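# With ec_duplication_factor=2 the ring holds two copies of each of the
# ec_n_unique_fragments frag indexes, so a 200 for the duplicate of the
# local frag index appears to the reconstructor as a second usable
# response: with quarantine_threshold set to 1 that is enough to keep the
# local fragment out of quarantine.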
@patch_policies([ECStoragePolicy(0, name='ec', is_default=True,
@ -5455,6 +6094,13 @@ class TestObjectReconstructorECDuplicationFactor(TestObjectReconstructor):
self.fabricated_ring = FabricatedRing(replicas=28, devices=56)
def _test_reconstruct_with_duplicate_frags_no_errors(self, index):
utils.mkdirs(os.path.join(self.devices, 'sda1'))
df_mgr = self.reconstructor._df_router[self.policy]
df = df_mgr.get_diskfile('sda1', 9, 'a', 'c', 'o',
policy=self.policy)
write_diskfile(df, self.ts(), data=b'', frag_index=2)
df.open()
job = {
'partition': 0,
'policy': self.policy,
@ -5462,12 +6108,6 @@ class TestObjectReconstructorECDuplicationFactor(TestObjectReconstructor):
part_nodes = self.policy.object_ring.get_part_nodes(0)
node = part_nodes[index]
node['backend_index'] = self.policy.get_backend_index(node['index'])
metadata = {
'name': '/a/c/o',
'Content-Length': 0,
'ETag': 'etag',
'X-Timestamp': '1234567890.12345',
}
test_data = (b'rebuild' * self.policy.ec_segment_size)[:-777]
etag = md5(test_data, usedforsecurity=False).hexdigest()
@ -5486,9 +6126,9 @@ class TestObjectReconstructorECDuplicationFactor(TestObjectReconstructor):
called_headers = []
orig_func = object_reconstructor.ObjectReconstructor._get_response
def _get_response_hook(self, node, part, path, headers, policy):
def _get_response_hook(self, node, policy, part, path, headers):
called_headers.append(headers)
return orig_func(self, node, part, path, headers, policy)
return orig_func(self, node, policy, part, path, headers)
# need parity + 1 node failures to reach duplicated fragments
failed_start_at = (
@ -5505,7 +6145,7 @@ class TestObjectReconstructorECDuplicationFactor(TestObjectReconstructor):
with mocked_http_conn(
*codes, body_iter=body_iter, headers=headers):
df = self.reconstructor.reconstruct_fa(
job, node, metadata)
job, node, df)
fixed_body = b''.join(df.reader())
self.assertEqual(len(fixed_body), len(broken_body))
self.assertEqual(

View File

@ -16,11 +16,9 @@ from collections import defaultdict
import mock
import os
import time
import unittest
import eventlet
import itertools
from six.moves import urllib
from swift.common.exceptions import DiskFileNotExist, DiskFileError, \
@ -28,8 +26,7 @@ from swift.common.exceptions import DiskFileNotExist, DiskFileError, \
from swift.common import swob
from swift.common import utils
from swift.common.storage_policy import POLICIES, EC_POLICY
from swift.common.utils import Timestamp
from swift.obj import ssync_sender, server
from swift.obj import ssync_sender, server, diskfile
from swift.obj.reconstructor import RebuildingECDiskFileStream, \
ObjectReconstructor
from swift.obj.replicator import ObjectReplicator
@ -38,7 +35,7 @@ from test import listen_zero
from test.debug_logger import debug_logger
from test.unit.obj.common import BaseTest
from test.unit import patch_policies, encode_frag_archive_bodies, \
skip_if_no_xattrs, quiet_eventlet_exceptions
skip_if_no_xattrs, quiet_eventlet_exceptions, make_timestamp_iter
class TestBaseSsync(BaseTest):
@ -62,8 +59,7 @@ class TestBaseSsync(BaseTest):
'log_requests': 'false'}
self.rx_logger = debug_logger()
self.rx_controller = server.ObjectController(conf, self.rx_logger)
self.ts_iter = (Timestamp(t)
for t in itertools.count(int(time.time())))
self.ts_iter = make_timestamp_iter()
self.rx_ip = '127.0.0.1'
sock = listen_zero()
self.rx_server = eventlet.spawn(
@ -653,11 +649,12 @@ class TestSsyncEC(TestBaseSsyncEC):
reconstruct_fa_calls = []
def fake_reconstruct_fa(job, node, metadata):
reconstruct_fa_calls.append((job, node, policy, metadata))
def fake_reconstruct_fa(job, node, df):
reconstruct_fa_calls.append((job, node, policy, df))
if len(reconstruct_fa_calls) == 2:
# simulate second reconstruct failing
raise DiskFileError
metadata = df.get_datafile_metadata()
content = self._get_object_data(metadata['name'],
frag_index=rx_node_index)
return RebuildingECDiskFileStream(
@ -702,7 +699,8 @@ class TestSsyncEC(TestBaseSsyncEC):
# remove the failed df from expected synced df's
expect_sync_paths = ['/a/c/o1', '/a/c/o2', '/a/c/o3', '/a/c/o5']
failed_path = reconstruct_fa_calls[1][3]['name']
failed_df = reconstruct_fa_calls[1][3]
failed_path = failed_df.get_datafile_metadata()['name']
expect_sync_paths.remove(failed_path)
failed_obj = None
for obj, diskfiles in tx_objs.items():
@ -843,26 +841,26 @@ class TestSsyncEC(TestBaseSsyncEC):
class FakeResponse(object):
def __init__(self, frag_index, obj_data, length=None):
self.headers = {
'X-Object-Sysmeta-Ec-Frag-Index': str(frag_index),
'X-Object-Sysmeta-Ec-Etag': 'the etag',
'X-Backend-Timestamp': '1234567890.12345'
}
def __init__(self, frag_index, obj_data, length=None, status=200):
self.frag_index = frag_index
self.obj_data = obj_data
self.data = b''
self.length = length
self.status = 200
self.status = status
def init(self, path):
def init(self, path, conf):
if isinstance(self.obj_data, Exception):
self.data = self.obj_data
else:
self.data = self.obj_data[path][self.frag_index]
self.conf = conf
def getheaders(self):
return self.headers
return {
'X-Object-Sysmeta-Ec-Frag-Index': str(self.frag_index),
'X-Object-Sysmeta-Ec-Etag': 'the etag',
'X-Backend-Timestamp': self.conf['timestamp'].internal
}
def read(self, length):
if isinstance(self.data, Exception):
@ -878,7 +876,9 @@ class TestSsyncECReconstructorSyncJob(TestBaseSsyncEC):
self.rx_node_index = 0
self.tx_node_index = 1
# create sender side diskfiles...
# create sender side diskfiles...ensure their timestamps are in the
# past so that tests that set reclaim_age=0 succeed in reclaiming
self.ts_iter = make_timestamp_iter(offset=-1000)
self.tx_objs = {}
tx_df_mgr = self.daemon._df_router[self.policy]
t1 = next(self.ts_iter)
@ -887,6 +887,8 @@ class TestSsyncECReconstructorSyncJob(TestBaseSsyncEC):
t2 = next(self.ts_iter)
self.tx_objs['o2'] = self._create_ondisk_files(
tx_df_mgr, 'o2', self.policy, t2, (self.tx_node_index,))
self.response_confs = {'/a/c/o1': {'timestamp': t1},
'/a/c/o2': {'timestamp': t2}}
self.suffixes = set()
for diskfiles in list(self.tx_objs.values()):
@ -900,7 +902,7 @@ class TestSsyncECReconstructorSyncJob(TestBaseSsyncEC):
self.frag_length = int(
self.tx_objs['o1'][0].get_metadata()['Content-Length'])
def _test_reconstructor_sync_job(self, frag_responses):
def _test_reconstructor_sync_job(self, frag_responses, custom_conf=None):
# Helper method to mock reconstructor to consume given lists of fake
# responses while reconstructing a fragment for a sync type job. The
# tests verify that when the reconstructed fragment iter fails in some
@ -908,25 +910,31 @@ class TestSsyncECReconstructorSyncJob(TestBaseSsyncEC):
# node which have incorrect data.
# See https://bugs.launchpad.net/swift/+bug/1631144
custom_conf = custom_conf if custom_conf else {}
# frag_responses is a list of two lists of responses to each
# reconstructor GET request for a fragment archive. The two items in
# the outer list are lists of responses for each of the two fragments
# to be reconstructed. Items in the inner lists are responses for each
# of the other fragments fetched during the reconstructor rebuild.
# to be reconstructed, and are used in the order that ssync syncs the
# fragments. Items in the inner lists are responses for each of the
# other fragments fetched during the reconstructor rebuild.
path_to_responses = {}
fake_get_response_calls = []
def fake_get_response(recon, node, part, path, headers, policy):
def fake_get_response(recon, node, policy, part, path, headers):
# select a list of fake responses for this path and return the next
# from the list
# from the list: we don't know the order in which paths will show
# up but we do want frag_responses[0] to be used first, so the
# frag_responses aren't bound to a path until this point
if path not in path_to_responses:
path_to_responses[path] = frag_responses.pop(0)
response = path_to_responses[path].pop()
# the frag_responses list is in ssync task order, we only know the
# the frag_responses list is in ssync task order: we only know the
# path when consuming the responses so initialise the path in the
# response now
if response:
response.init(path)
response.init(path, self.response_confs[path])
# should be full path but just used for logging...
response.full_path = path
fake_get_response_calls.append(path)
return response
@ -944,17 +952,19 @@ class TestSsyncECReconstructorSyncJob(TestBaseSsyncEC):
mock.patch.object(
self.policy.object_ring, 'get_part_nodes',
fake_get_part_nodes):
self.reconstructor = ObjectReconstructor(
{}, logger=self.logger)
conf = self.daemon_conf
conf.update(custom_conf)
self.reconstructor = ObjectReconstructor(conf, logger=self.logger)
job = {
'device': self.device,
'partition': self.partition,
'policy': self.policy,
'frag_index': self.tx_node_index,
'sync_diskfile_builder':
self.reconstructor.reconstruct_fa
}
sender = ssync_sender.Sender(
self.daemon, self.job_node, job, self.suffixes)
self.reconstructor, self.job_node, job, self.suffixes)
sender.connect, trace = self.make_connect_wrapper(sender)
sender()
return trace
@ -975,7 +985,7 @@ class TestSsyncECReconstructorSyncJob(TestBaseSsyncEC):
df = self._open_rx_diskfile(
obj_name, self.policy, self.rx_node_index)
msgs.append('Unexpected rx diskfile for %r with content %r' %
(obj_name, ''.join([d for d in df.reader()])))
(obj_name, b''.join([d for d in df.reader()])))
except DiskFileNotExist:
pass # expected outcome
if msgs:
@ -987,6 +997,7 @@ class TestSsyncECReconstructorSyncJob(TestBaseSsyncEC):
# trampoline for the receiver to write a log
eventlet.sleep(0)
log_lines = self.rx_logger.get_lines_for_level('warning')
self.assertEqual(1, len(log_lines), self.rx_logger.all_log_lines())
self.assertIn('ssync subrequest failed with 499',
log_lines[0])
self.assertFalse(log_lines[1:])
@ -1009,7 +1020,7 @@ class TestSsyncECReconstructorSyncJob(TestBaseSsyncEC):
df = self._open_rx_diskfile(
obj_name, self.policy, self.rx_node_index)
msgs.append('Unexpected rx diskfile for %r with content %r' %
(obj_name, ''.join([d for d in df.reader()])))
(obj_name, b''.join([d for d in df.reader()])))
except DiskFileNotExist:
pass # expected outcome
if msgs:
@ -1053,7 +1064,7 @@ class TestSsyncECReconstructorSyncJob(TestBaseSsyncEC):
df = self._open_rx_diskfile(
obj_name, self.policy, self.rx_node_index)
msgs.append('Unexpected rx diskfile for %r with content %r' %
(obj_name, ''.join([d for d in df.reader()])))
(obj_name, b''.join([d for d in df.reader()])))
except DiskFileNotExist:
pass # expected outcome
if msgs:
@ -1111,7 +1122,7 @@ class TestSsyncECReconstructorSyncJob(TestBaseSsyncEC):
df = self._open_rx_diskfile(
obj_name, self.policy, self.rx_node_index)
msgs.append('Unexpected rx diskfile for %r with content %r' %
(obj_name, ''.join([d for d in df.reader()])))
(obj_name, b''.join([d for d in df.reader()])))
except DiskFileNotExist:
pass # expected outcome
if msgs:
@ -1123,6 +1134,109 @@ class TestSsyncECReconstructorSyncJob(TestBaseSsyncEC):
self.assertFalse(self.rx_logger.get_lines_for_level('warning'))
self.assertFalse(self.rx_logger.get_lines_for_level('error'))
def test_sync_reconstructor_quarantines_lonely_frag(self):
# The first fragment to sync gets only one response for the
# reconstructor to rebuild with, and that response is for the tx_node
# frag index: it should be quarantined, but after that the ssync
# session should still proceed with rebuilding the second frag.
lonely_frag_responses = [
FakeResponse(i, self.obj_data, status=404)
for i in range(self.policy.ec_ndata + self.policy.ec_nparity)]
lonely_frag_responses[self.tx_node_index].status = 200
frag_responses = [
lonely_frag_responses,
[FakeResponse(i, self.obj_data)
for i in range(self.policy.ec_ndata + self.policy.ec_nparity)]]
# configure reconstructor to quarantine the lonely frag
custom_conf = {'reclaim_age': 0, 'quarantine_threshold': 1}
trace = self._test_reconstructor_sync_job(frag_responses, custom_conf)
results = self._analyze_trace(trace)
self.assertEqual(2, len(results['tx_missing']))
self.assertEqual(2, len(results['rx_missing']))
self.assertEqual(1, len(results['tx_updates']))
self.assertFalse(results['rx_updates'])
self.assertEqual('PUT', results['tx_updates'][0].get('method'))
synced_obj_path = results['tx_updates'][0].get('path')
synced_obj_name = synced_obj_path[-2:]
# verify that the second frag was rebuilt on rx node...
msgs = []
try:
df = self._open_rx_diskfile(
synced_obj_name, self.policy, self.rx_node_index)
self.assertEqual(
self._get_object_data(synced_obj_path,
frag_index=self.rx_node_index),
b''.join([d for d in df.reader()]))
except DiskFileNotExist:
msgs.append('Missing rx diskfile for %r' % synced_obj_name)
# ...and it is still on tx node...
try:
df = self._open_tx_diskfile(
synced_obj_name, self.policy, self.tx_node_index)
self.assertEqual(
self._get_object_data(df._name,
frag_index=self.tx_node_index),
b''.join([d for d in df.reader()]))
except DiskFileNotExist:
msgs.append('Missing tx diskfile for %r' % synced_obj_name)
# verify that the lonely frag was not rebuilt on rx node and was
# removed on tx node
obj_names = list(self.tx_objs)
obj_names.remove(synced_obj_name)
quarantined_obj_name = obj_names[0]
try:
df = self._open_rx_diskfile(
quarantined_obj_name, self.policy, self.rx_node_index)
msgs.append(
'Unexpected rx diskfile for %r with content %r' %
(quarantined_obj_name, b''.join([d for d in df.reader()])))
except DiskFileNotExist:
pass # expected outcome
try:
df = self._open_tx_diskfile(
quarantined_obj_name, self.policy, self.tx_node_index)
msgs.append(
'Unexpected tx diskfile for %r with content %r' %
(quarantined_obj_name, b''.join([d for d in df.reader()])))
except DiskFileNotExist:
pass # expected outcome
if msgs:
self.fail('Failed with:\n%s' % '\n'.join(msgs))
error_lines = self.logger.get_lines_for_level('error')
self.assertEqual(2, len(error_lines), error_lines)
self.assertIn('Unable to get enough responses', error_lines[0])
self.assertIn('Unable to get enough responses', error_lines[1])
warning_lines = self.logger.get_lines_for_level('warning')
self.assertEqual(1, len(warning_lines), warning_lines)
self.assertIn('Quarantined object', warning_lines[0])
# check we have a quarantined data file
df_mgr = self.daemon._df_router[self.policy]
quarantined_df = df_mgr.get_diskfile(
self.device, self.partition, account='a', container='c',
obj=quarantined_obj_name, policy=self.policy,
frag_index=self.tx_node_index)
df_hash = os.path.basename(quarantined_df._datadir)
quarantine_dir = os.path.join(
quarantined_df._device_path, 'quarantined',
diskfile.get_data_dir(self.policy), df_hash)
self.assertTrue(os.path.isdir(quarantine_dir))
data_file = os.listdir(quarantine_dir)[0]
with open(os.path.join(quarantine_dir, data_file), 'rb') as fd:
self.assertEqual(
self._get_object_data(quarantined_df._name,
frag_index=self.tx_node_index),
fd.read())
# trampoline for the receiver to write a log
eventlet.sleep(0)
self.assertFalse(self.rx_logger.get_lines_for_level('warning'))
self.assertFalse(self.rx_logger.get_lines_for_level('error'))
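# Note on the layout asserted above: a quarantined EC fragment ends up
# under <device_path>/quarantined/<objects dir for the policy>/<hash of
# the object's datadir>, with the original .data file moved intact, which
# is why its contents can still be read back and compared here.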
def test_sync_reconstructor_rebuild_ok(self):
# Sanity test for this class of tests. Both fragments get a full
# complement of responses and rebuild correctly.

View File

@ -4699,10 +4699,30 @@ class TestReplicatedObjectController(
self.assertEqual(resp.status_int, 503)
def test_node_request_setting(self):
baseapp = proxy_server.Application({'request_node_count': '3'},
# default is 2 * replicas
baseapp = proxy_server.Application({},
container_ring=FakeRing(),
account_ring=FakeRing())
self.assertEqual(baseapp.request_node_count(3), 3)
self.assertEqual(6, baseapp.request_node_count(3))
def do_test(value, replicas, expected):
baseapp = proxy_server.Application({'request_node_count': value},
container_ring=FakeRing(),
account_ring=FakeRing())
self.assertEqual(expected, baseapp.request_node_count(replicas))
do_test('3', 4, 3)
do_test('1 * replicas', 4, 4)
do_test('2 * replicas', 4, 8)
do_test('4', 4, 4)
do_test('5', 4, 5)
for bad in ('1.1', 1.1, 'auto', 'bad',
'2.5 * replicas', 'two * replicas'):
with self.assertRaises(ValueError):
proxy_server.Application({'request_node_count': bad},
container_ring=FakeRing(),
account_ring=FakeRing())
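# The cases above pin down the accepted request_node_count formats: a
# bare integer (e.g. '3') or '<integer> * replicas' (e.g. '2 * replicas',
# which scales with the ring), while floats and non-numeric values raise
# ValueError when the proxy Application is constructed.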
def test_iter_nodes(self):
with save_globals():