From bcff1282b579f48efdc9bfcdde5b666b9ab23d3a Mon Sep 17 00:00:00 2001
From: Pete Zaitcev
Date: Wed, 22 Jul 2020 22:43:13 -0500
Subject: [PATCH] Band-aid and test the crash of the account server

We have a better fix in the works, see the change
Ic53068867feb0c18c88ddbe029af83a970336545. But it is taking too long
to coalesce, and users are unhappy right now.

Related: rhbz#1838242, rhbz#1965348
Change-Id: I3f7bfc2877355b7cb433af77c4e2dfdfa94ff14d
---
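For review, a minimal standalone sketch of the accounting the updater
grows below (the harness and the status lists are hypothetical;
majority_size mirrors swift.common.utils.majority_size):

    def majority_size(n):
        return n // 2 + 1   # as in swift.common.utils

    def classify(statuses):
        # One HTTP status per account replica, as the updater sees them.
        successes = sum(1 for s in statuses if 200 <= s <= 299)
        stub404s = sum(1 for s in statuses if s == 404)
        if successes >= majority_size(len(statuses)):
            return 'reported'      # broker.reported(...)
        if stub404s == len(statuses):
            return 'quarantined'   # broker.quarantine(...), account is gone
        return 'failed'            # transient; retried on a later pass

    assert classify([201, 201, 404]) == 'reported'
    assert classify([404, 404, 404]) == 'quarantined'
    assert classify([404, 404, 500]) == 'failed'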
 swift/account/server.py               |   5 +-
 swift/container/updater.py            |  15 ++-
 test/probe/common.py                  |  28 ++++-
 test/probe/test_container_failures.py |  30 +-----
 test/probe/test_object_failures.py    |   2 +-
 test/probe/test_orphan_container.py   | 150 ++++++++++++++++++++++++++
 6 files changed, 196 insertions(+), 34 deletions(-)
 create mode 100644 test/probe/test_orphan_container.py

diff --git a/swift/account/server.py b/swift/account/server.py
index 592606a21a..c09468ce8b 100644
--- a/swift/account/server.py
+++ b/swift/account/server.py
@@ -185,8 +185,9 @@ class AccountController(BaseStorageServer):
                 broker.initialize(timestamp.internal)
             except DatabaseAlreadyExists:
                 pass
-            if req.headers.get('x-account-override-deleted', 'no').lower() != \
-                    'yes' and broker.is_deleted():
+            if (req.headers.get('x-account-override-deleted', 'no').lower() !=
+                    'yes' and broker.is_deleted()) \
+                    or not os.path.exists(broker.db_file):
                 return HTTPNotFound(request=req)
             broker.put_container(container, req.headers['x-put-timestamp'],
                                  req.headers['x-delete-timestamp'],
diff --git a/swift/container/updater.py b/swift/container/updater.py
index b2a109274e..396c16630b 100644
--- a/swift/container/updater.py
+++ b/swift/container/updater.py
@@ -271,9 +271,13 @@ class ContainerUpdater(Daemon):
                             info['storage_policy_index'])
                       for node in nodes]
             successes = 0
+            stub404s = 0
             for event in events:
-                if is_success(event.wait()):
+                result = event.wait()
+                if is_success(result):
                     successes += 1
+                if result == 404:
+                    stub404s += 1
             if successes >= majority_size(len(events)):
                 self.logger.increment('successes')
                 self.successes += 1
@@ -283,6 +287,15 @@
                 broker.reported(info['put_timestamp'],
                                 info['delete_timestamp'], info['object_count'],
                                 info['bytes_used'])
+            elif stub404s == len(events):
+                self.logger.increment('failures')
+                self.failures += 1
+                self.logger.debug(
+                    _('Update report stub for %(container)s %(dbfile)s'),
+                    {'container': container, 'dbfile': dbfile})
+                broker.quarantine('no account replicas exist')
+                # All that's left at this point is a few sacks of Gnocchi,
+                # easily collected by the object auditor's dark data watcher.
             else:
                 self.logger.increment('failures')
                 self.failures += 1
diff --git a/test/probe/common.py b/test/probe/common.py
index 6780855734..d2bdfb4dc6 100644
--- a/test/probe/common.py
+++ b/test/probe/common.py
@@ -36,8 +36,8 @@ from swiftclient import get_auth, head_account, client
 from swift.common import internal_client, direct_client, utils
 from swift.common.direct_client import DirectClientException
 from swift.common.ring import Ring
-from swift.common.utils import readconf, renamer, \
-    rsync_module_interpolation, md5
+from swift.common.utils import hash_path, md5, \
+    readconf, renamer, rsync_module_interpolation
 from swift.common.manager import Manager
 from swift.common.storage_policy import POLICIES, EC_POLICY, REPL_POLICY
 from swift.obj.diskfile import get_data_dir
@@ -582,6 +582,13 @@ class ProbeTest(unittest.TestCase):
         return daemon
 
 
+def _get_db_file_path(obj_dir):
+    files = sorted(os.listdir(obj_dir), reverse=True)
+    for filename in files:
+        if filename.endswith('db'):
+            return os.path.join(obj_dir, filename)
+
+
 class ReplProbeTest(ProbeTest):
 
     acct_cont_required_replicas = 3
@@ -625,6 +632,23 @@ class ReplProbeTest(ProbeTest):
         return self.direct_container_op(direct_client.direct_get_container,
                                         account, container, expect_failure)
 
+    def get_container_db_files(self, container):
+        opart, onodes = self.container_ring.get_nodes(self.account, container)
+        onode = onodes[0]
+        db_files = []
+        for onode in onodes:
+            node_id = (onode['port'] % 100) // 10
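+            # (Stock-SAIO assumption: node N's container server listens
+            # on port 62N1, so e.g. 6221 % 100 // 10 == 2 picks the
+            # container-server 2.conf entry in self.configs.)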
+            device = onode['device']
+            hash_str = hash_path(self.account, container)
+            server_conf = readconf(self.configs['container-server'][node_id])
+            devices = server_conf['app:container-server']['devices']
+            obj_dir = '%s/%s/containers/%s/%s/%s/' % (devices,
+                                                      device, opart,
+                                                      hash_str[-3:], hash_str)
+            db_files.append(_get_db_file_path(obj_dir))
+
+        return db_files
+
 
 class ECProbeTest(ProbeTest):
 
diff --git a/test/probe/test_container_failures.py b/test/probe/test_container_failures.py
index c8587b6b7b..6a3beb1042 100644
--- a/test/probe/test_container_failures.py
+++ b/test/probe/test_container_failures.py
@@ -14,8 +14,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from os import listdir
-from os.path import join as path_join
 from unittest import main
 from uuid import uuid4
 
@@ -26,20 +24,13 @@ from swiftclient import client
 
 from swift.common import direct_client
 from swift.common.exceptions import ClientException
-from swift.common.utils import hash_path, readconf
+from swift.common.utils import readconf
 from test.probe.common import kill_nonprimary_server, \
     kill_server, ReplProbeTest, start_server
 
 eventlet.monkey_patch(all=False, socket=True)
 
 
-def get_db_file_path(obj_dir):
-    files = sorted(listdir(obj_dir), reverse=True)
-    for filename in files:
-        if filename.endswith('db'):
-            return path_join(obj_dir, filename)
-
-
 class TestContainerFailures(ReplProbeTest):
 
     def test_one_node_fails(self):
@@ -154,23 +145,6 @@ class TestContainerFailures(ReplProbeTest):
             client.put_object(self.url, self.token, container1, 'obj3', 'x')
         self.assertEqual(caught.exception.http_status, 503)
 
-    def _get_container_db_files(self, container):
-        opart, onodes = self.container_ring.get_nodes(self.account, container)
-        onode = onodes[0]
-        db_files = []
-        for onode in onodes:
-            node_id = (onode['port'] % 100) // 10
-            device = onode['device']
-            hash_str = hash_path(self.account, container)
-            server_conf = readconf(self.configs['container-server'][node_id])
-            devices = server_conf['app:container-server']['devices']
-            obj_dir = '%s/%s/containers/%s/%s/%s/' % (devices,
-                                                      device, opart,
-                                                      hash_str[-3:], hash_str)
-            db_files.append(get_db_file_path(obj_dir))
-
-        return db_files
-
     def test_locked_container_dbs(self):
 
         def run_test(num_locks, catch_503):
@@ -179,7 +153,7 @@
             # Get the container info into memcache (so no stray
             # get_container_info calls muck up our timings)
             client.get_container(self.url, self.token, container)
-            db_files = self._get_container_db_files(container)
+            db_files = self.get_container_db_files(container)
             db_conns = []
             for i in range(num_locks):
                 db_conn = connect(db_files[i])
diff --git a/test/probe/test_object_failures.py b/test/probe/test_object_failures.py
index b65e44789d..0321473310 100644
--- a/test/probe/test_object_failures.py
+++ b/test/probe/test_object_failures.py
@@ -63,7 +63,7 @@ class TestObjectFailures(ReplProbeTest):
         opart, onodes = self.object_ring.get_nodes(
             self.account, container, obj)
         onode = onodes[0]
-        node_id = (onode['port'] % 100) / 10
+        node_id = (onode['port'] % 100) // 10
         device = onode['device']
         hash_str = hash_path(self.account, container, obj)
         obj_server_conf = readconf(self.configs['object-server'][node_id])
diff --git a/test/probe/test_orphan_container.py b/test/probe/test_orphan_container.py
new file mode 100644
index 0000000000..414cb2bd7a
--- /dev/null
+++ b/test/probe/test_orphan_container.py
@@ -0,0 +1,150 @@
+#!/usr/bin/python -u
+# Copyright (c) 2010-2012 OpenStack Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+from swiftclient import client
+from unittest import main
+
+from swift.common.exceptions import LockTimeout
+from swift.common.manager import Manager
+from swift.common.utils import hash_path, readconf, Timestamp
+from swift.container.backend import ContainerBroker
+
+from test.probe.common import (
+    kill_nonprimary_server, kill_server, start_server, ReplProbeTest)
+
+# Why is this not called test_container_orphan? Because the crash
+# happens in the account server, so both account and container
+# services are involved.
+#
+# The common way users hit this is to deploy an overcloud with TripleO
+# and add Gnocchi. Gnocchi hammers Swift, so its container has updates
+# outstanding all the time. Then, users crash the overcloud and
+# re-deploy it, using a new suffix in swift.conf. Thereafter, the
+# container service inherits the old container with outstanding
+# updates, and the container updater tries to send updates to an
+# account that cannot be found anymore. In this situation, in Swift
+# 2.25.0, the account server tracebacks, and the cycle never ends.
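+#
+# (The "suffix" above is swift_hash_path_suffix. hash_path() mixes it
+# into every ring hash, roughly md5(prefix + '/' + account + suffix)
+# for an account, so a DB created under the old suffix no longer
+# matches any lookup once the suffix changes. The exact formula is a
+# simplification; see swift.common.utils.hash_path.)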
+
+
+class TestOrphanContainer(ReplProbeTest):
+
+    def get_account_db_files(self, account):
+
+        # This is "more correct" than (port_num % 100) // 10, but is it
+        # worth it? The port_num vs. node_id assumption is embedded all over.
+        account_configs = {}
+        for _, cname in self.configs['account-server'].items():
+            conf = readconf(cname)
+            # The config parser cannot know whether this is a number, so int()
+            port = int(conf['app:account-server']['bind_port'])
+            account_configs[port] = conf
+
+        part, nodes = self.account_ring.get_nodes(account)
+        hash_str = hash_path(account)
+
+        ret = []
+        for node in nodes:
+
+            data_dir = 'accounts'
+            device = node['device']
+            conf = account_configs[node['port']]
+            devices = conf['app:account-server']['devices']
+
+            # os.path.join is for the weak
+            db_file = '%s/%s/%s/%s/%s/%s/%s.db' % (
+                devices, device, data_dir, part,
+                hash_str[-3:], hash_str, hash_str)
+            ret.append(db_file)
+        return ret
+
+    def test_update_pending(self):
+
+        # Create a container
+        container = 'contx'
+        client.put_container(self.url, self.token, container)
+
+        part, nodes = self.account_ring.get_nodes(self.account)
+        anode = nodes[0]
+
+        # Stop a quorum of account servers.
+        # This allows the put to continue later.
+        kill_nonprimary_server(nodes, self.ipport2server)
+        kill_server((anode['ip'], anode['port']), self.ipport2server)
+
+        # Put an object.
+        # This creates an outstanding update.
+        client.put_object(self.url, self.token, container, 'object1', b'123')
+
+        cont_db_files = self.get_container_db_files(container)
+        self.assertEqual(len(cont_db_files), 3)
+
+        # Collect the observable state from the containers
+        outstanding_files = []
+        for cfile in cont_db_files:
+            broker = ContainerBroker(cfile)
+            try:
+                info = broker.get_info()
+            except LockTimeout:
+                self.fail('LockTimeout at %s' % (cfile,))
+            if Timestamp(info['put_timestamp']) <= 0:
+                self.fail('No put_timestamp at %s' % (cfile,))
+            # Correct even if reported_put_timestamp is zero.
+            if info['put_timestamp'] > info['reported_put_timestamp']:
+                outstanding_files.append(cfile)
+        self.assertGreater(len(outstanding_files), 0)
+
+        # At this point the users shut everything down and screw up the
+        # hash in swift.conf. But we destroy the account DB instead.
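+        # (Unlinking the .db files below is a stand-in for the suffix
+        # change: either way, hash_path() stops leading to an existing
+        # account DB, and every account replica answers the container's
+        # update with the stub 404 that the updater now counts.)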
+        files = self.get_account_db_files(self.account)
+        for afile in files:
+            os.unlink(afile)
+
+        # Restart the stopped primary server
+        start_server((anode['ip'], anode['port']), self.ipport2server)
+
+        # Make sure the updaters run
+        Manager(['container-updater']).once()
+
+        # Collect the observable container state again and examine it
+        outstanding_files_new = []
+        for cfile in cont_db_files:
+
+            # We aren't catching DatabaseConnectionError, because
+            # we only want to approve of DBs that were quarantined,
+            # and not otherwise damaged. So if the code below throws
+            # an exception for any other reason, we want the test to fail.
+            if not os.path.exists(cfile):
+                continue
+
+            broker = ContainerBroker(cfile)
+            try:
+                info = broker.get_info()
+            except LockTimeout:
+                self.fail('LockTimeout at %s' % (cfile,))
+            if Timestamp(info['put_timestamp']) <= 0:
+                self.fail('No put_timestamp at %s' % (cfile,))
+            # Correct even if reported_put_timestamp is zero.
+            if info['put_timestamp'] > info['reported_put_timestamp']:
+                outstanding_files_new.append(cfile)
+        self.assertEqual(len(outstanding_files_new), 0)
+
+        self.get_to_final_state()
+
+
+if __name__ == '__main__':
+    main()
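Like the rest of test/probe, the new test assumes a disposable
SAIO-style cluster, since it stops servers and deletes DBs outright.
Under that assumption it can be run on its own, e.g. with the
nose-based runner the probe suite has historically used:

    $ nosetests test/probe/test_orphan_container.py

or directly, since the module guards on __main__:

    $ python test/probe/test_orphan_container.py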