Band-aid and test the crash of the account server

We have a better fix in the works (see change
Ic53068867feb0c18c88ddbe029af83a970336545), but it is
taking too long to come together, and users are unhappy right now.

Related: rhbz#1838242, rhbz#1965348
Change-Id: I3f7bfc2877355b7cb433af77c4e2dfdfa94ff14d
Author: Pete Zaitcev
Date:   2020-07-22 22:43:13 -05:00
Parent: 901d2e15b7
Commit: bcff1282b5

6 changed files with 196 additions and 34 deletions

File: swift/account/server.py

@@ -185,8 +185,9 @@ class AccountController(BaseStorageServer):
                     broker.initialize(timestamp.internal)
                 except DatabaseAlreadyExists:
                     pass
-            if req.headers.get('x-account-override-deleted', 'no').lower() != \
-                    'yes' and broker.is_deleted():
+            if (req.headers.get('x-account-override-deleted', 'no').lower() !=
+                    'yes' and broker.is_deleted()) \
+                    or not os.path.exists(broker.db_file):
                 return HTTPNotFound(request=req)
             broker.put_container(container, req.headers['x-put-timestamp'],
                                  req.headers['x-delete-timestamp'],
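
The parenthesization in the new condition matters: a missing DB file now yields
404 unconditionally, even when X-Account-Override-Deleted: yes is sent, which
is what shields the account server from tracebacking on a vanished database.
A minimal standalone sketch of the predicate (the function name and bare-dict
headers are illustrative, not Swift's actual API):

    import os

    def put_container_returns_404(headers, broker):
        # Mirrors the band-aid: the override header can bypass the
        # is_deleted() check, but nothing bypasses a DB file that is
        # gone from disk.
        override = headers.get('x-account-override-deleted', 'no').lower()
        return ((override != 'yes' and broker.is_deleted())
                or not os.path.exists(broker.db_file))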

File: swift/container/updater.py

@@ -271,9 +271,13 @@ class ContainerUpdater(Daemon):
                                 info['storage_policy_index'])
                       for node in nodes]
             successes = 0
+            stub404s = 0
             for event in events:
-                if is_success(event.wait()):
+                result = event.wait()
+                if is_success(result):
                     successes += 1
+                if result == 404:
+                    stub404s += 1
             if successes >= majority_size(len(events)):
                 self.logger.increment('successes')
                 self.successes += 1
@@ -283,6 +287,15 @@ class ContainerUpdater(Daemon):
                 broker.reported(info['put_timestamp'],
                                 info['delete_timestamp'], info['object_count'],
                                 info['bytes_used'])
+            elif stub404s == len(events):
+                self.logger.increment('failures')
+                self.failures += 1
+                self.logger.debug(
+                    _('Update report stub for %(container)s %(dbfile)s'),
+                    {'container': container, 'dbfile': dbfile})
+                broker.quarantine('no account replicas exist')
+                # All that's left at this point is a few sacks of Gnocchi,
+                # easily collected by the dark data watcher in object auditor.
             else:
                 self.logger.increment('failures')
                 self.failures += 1
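
The updater's report handling now has three outcomes instead of two. A
standalone sketch of the decision (the helper name is made up for
illustration; the thresholds mirror the diff, with Swift's majority_size(n)
being n // 2 + 1 and is_success() meaning a 2xx status):

    def classify_report(results):
        # results: one HTTP status per account replica, e.g. [404, 404, 404]
        successes = sum(1 for r in results if 200 <= r <= 299)
        stub404s = sum(1 for r in results if r == 404)
        if successes >= len(results) // 2 + 1:
            return 'reported'      # account accepted the update
        elif stub404s == len(results):
            return 'quarantine'    # no account replica exists at all
        else:
            return 'failure'       # transient trouble, retry next cycle

    assert classify_report([201, 201, 404]) == 'reported'
    assert classify_report([404, 404, 404]) == 'quarantine'
    assert classify_report([404, 404, 503]) == 'failure'

Only the unanimous-404 case quarantines the container DB; a single timeout or
5xx keeps the container on the ordinary retry path.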

File: test/probe/common.py

@@ -36,8 +36,8 @@ from swiftclient import get_auth, head_account, client
 from swift.common import internal_client, direct_client, utils
 from swift.common.direct_client import DirectClientException
 from swift.common.ring import Ring
-from swift.common.utils import readconf, renamer, \
-    rsync_module_interpolation, md5
+from swift.common.utils import hash_path, md5, \
+    readconf, renamer, rsync_module_interpolation
 from swift.common.manager import Manager
 from swift.common.storage_policy import POLICIES, EC_POLICY, REPL_POLICY
 from swift.obj.diskfile import get_data_dir
@@ -582,6 +582,13 @@ class ProbeTest(unittest.TestCase):
         return daemon
 
 
+def _get_db_file_path(obj_dir):
+    files = sorted(os.listdir(obj_dir), reverse=True)
+    for filename in files:
+        if filename.endswith('db'):
+            return os.path.join(obj_dir, filename)
+
+
 class ReplProbeTest(ProbeTest):
     acct_cont_required_replicas = 3
@@ -625,6 +632,23 @@ class ReplProbeTest(ProbeTest):
         return self.direct_container_op(direct_client.direct_get_container,
                                         account, container, expect_failure)
 
+    def get_container_db_files(self, container):
+        opart, onodes = self.container_ring.get_nodes(self.account, container)
+        onode = onodes[0]
+        db_files = []
+        for onode in onodes:
+            node_id = (onode['port'] % 100) // 10
+            device = onode['device']
+            hash_str = hash_path(self.account, container)
+            server_conf = readconf(self.configs['container-server'][node_id])
+            devices = server_conf['app:container-server']['devices']
+            obj_dir = '%s/%s/containers/%s/%s/%s/' % (devices,
+                                                      device, opart,
+                                                      hash_str[-3:], hash_str)
+            db_files.append(_get_db_file_path(obj_dir))
+        return db_files
+
+
 class ECProbeTest(ProbeTest):
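
The (onode['port'] % 100) // 10 expression leans on the SAIO convention that
the tens digit of a server's port is its node id. A quick sanity check under
that assumption (ports shown are the classic SAIO container-server ports; the
newer 62xx layout has the same tens digits):

    for port in (6011, 6021, 6031, 6041):
        print(port, '->', (port % 100) // 10)   # prints 1, 2, 3, 4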
class ECProbeTest(ProbeTest):

File: test/probe/test_container_failures.py

@@ -14,8 +14,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from os import listdir
-from os.path import join as path_join
 from unittest import main
 from uuid import uuid4
 
@@ -26,20 +24,13 @@ from swiftclient import client
 from swift.common import direct_client
 from swift.common.exceptions import ClientException
-from swift.common.utils import hash_path, readconf
+from swift.common.utils import readconf
 from test.probe.common import kill_nonprimary_server, \
     kill_server, ReplProbeTest, start_server
 
 eventlet.monkey_patch(all=False, socket=True)
 
 
-def get_db_file_path(obj_dir):
-    files = sorted(listdir(obj_dir), reverse=True)
-    for filename in files:
-        if filename.endswith('db'):
-            return path_join(obj_dir, filename)
-
-
 class TestContainerFailures(ReplProbeTest):
     def test_one_node_fails(self):
@@ -154,23 +145,6 @@ class TestContainerFailures(ReplProbeTest):
             client.put_object(self.url, self.token, container1, 'obj3', 'x')
         self.assertEqual(caught.exception.http_status, 503)
 
-    def _get_container_db_files(self, container):
-        opart, onodes = self.container_ring.get_nodes(self.account, container)
-        onode = onodes[0]
-        db_files = []
-        for onode in onodes:
-            node_id = (onode['port'] % 100) // 10
-            device = onode['device']
-            hash_str = hash_path(self.account, container)
-            server_conf = readconf(self.configs['container-server'][node_id])
-            devices = server_conf['app:container-server']['devices']
-            obj_dir = '%s/%s/containers/%s/%s/%s/' % (devices,
-                                                      device, opart,
-                                                      hash_str[-3:], hash_str)
-            db_files.append(get_db_file_path(obj_dir))
-        return db_files
-
     def test_locked_container_dbs(self):
         def run_test(num_locks, catch_503):
@@ -179,7 +153,7 @@ class TestContainerFailures(ReplProbeTest):
             # Get the container info into memcache (so no stray
             # get_container_info calls muck up our timings)
             client.get_container(self.url, self.token, container)
-            db_files = self._get_container_db_files(container)
+            db_files = self.get_container_db_files(container)
             db_conns = []
             for i in range(num_locks):
                 db_conn = connect(db_files[i])

File: test/probe/test_object_failures.py

@@ -63,7 +63,7 @@ class TestObjectFailures(ReplProbeTest):
         opart, onodes = self.object_ring.get_nodes(
             self.account, container, obj)
         onode = onodes[0]
-        node_id = (onode['port'] % 100) / 10
+        node_id = (onode['port'] % 100) // 10
         device = onode['device']
         hash_str = hash_path(self.account, container, obj)
         obj_server_conf = readconf(self.configs['object-server'][node_id])
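
This one-character change is a Python 3 fix: under py3, / between ints is true
division, so node_id silently became a float; // restores the integer node id
that the surrounding code (and the new get_container_db_files above) expects.
For illustration:

    print((6010 % 100) / 10)    # 1.0 under Python 3 (true division)
    print((6010 % 100) // 10)   # 1, an int, on both Python 2 and 3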

File: test/probe/test_orphan_container.py (new file)

@@ -0,0 +1,150 @@
+#!/usr/bin/python -u
+# Copyright (c) 2010-2012 OpenStack Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+from swiftclient import client
+from unittest import main
+
+from swift.common.exceptions import LockTimeout
+from swift.common.manager import Manager
+from swift.common.utils import hash_path, readconf, Timestamp
+from swift.container.backend import ContainerBroker
+from test.probe.common import (
+    kill_nonprimary_server, kill_server, start_server, ReplProbeTest)
+
+
+# Why is this not called test_container_orphan? Because the crash
+# happens in the account server, so both account and container
+# services are involved.
+#
+# The common way users get here is to use TripleO to deploy an overcloud
+# and add Gnocchi. Gnocchi hammers Swift, so its container has updates
+# all the time. Then, users crash the overcloud and re-deploy it,
+# using a new suffix in swift.conf. Thereafter, the container service
+# inherits the old container with outstanding updates, and the container
+# updater tries to send updates to the account server, while the account
+# cannot be found anymore. In this situation, in Swift 2.25.0, the
+# account server tracebacks, and the cycle continues without end.
+class TestOrphanContainer(ReplProbeTest):
+
+    def get_account_db_files(self, account):
+
+        # This is "more correct" than (port_num%100)//10, but is it worth it?
+        # We have the assumption about port_num vs node_id embedded all over.
+        account_configs = {}
+        for _, cname in self.configs['account-server'].items():
+            conf = readconf(cname)
+            # config parser cannot know if it's a number or not, so int()
+            port = int(conf['app:account-server']['bind_port'])
+            account_configs[port] = conf
+
+        part, nodes = self.account_ring.get_nodes(account)
+        hash_str = hash_path(account)
+
+        ret = []
+        for node in nodes:
+            data_dir = 'accounts'
+            device = node['device']
+            conf = account_configs[node['port']]
+            devices = conf['app:account-server']['devices']
+
+            # os.path.join is for the weak
+            db_file = '%s/%s/%s/%s/%s/%s/%s.db' % (
+                devices, device, data_dir, part,
+                hash_str[-3:], hash_str, hash_str)
+            ret.append(db_file)
+        return ret
+
+    def test_update_pending(self):
+
+        # Create container
+        container = 'contx'
+        client.put_container(self.url, self.token, container)
+
+        part, nodes = self.account_ring.get_nodes(self.account)
+        anode = nodes[0]
+
+        # Stop a quorum of account servers
+        # This allows the put to continue later.
+        kill_nonprimary_server(nodes, self.ipport2server)
+        kill_server((anode['ip'], anode['port']), self.ipport2server)
+
+        # Put object
+        # This creates an outstanding update.
+        client.put_object(self.url, self.token, container, 'object1', b'123')
+
+        cont_db_files = self.get_container_db_files(container)
+        self.assertEqual(len(cont_db_files), 3)
+
+        # Collect the observable state from containers
+        outstanding_files = []
+        for cfile in cont_db_files:
+            broker = ContainerBroker(cfile)
+            try:
+                info = broker.get_info()
+            except LockTimeout:
+                self.fail('LockTimeout at %s' % (cfile,))
+            if Timestamp(info['put_timestamp']) <= 0:
+                self.fail('No put_timestamp at %s' % (cfile,))
+            # Correct even if reported_put_timestamp is zero.
+            if info['put_timestamp'] > info['reported_put_timestamp']:
+                outstanding_files.append(cfile)
+        self.assertGreater(len(outstanding_files), 0)
+
+        # At this point the users shut everything down and screw up the
+        # hash in swift.conf. But we destroy the account DB instead.
+        files = self.get_account_db_files(self.account)
+        for afile in files:
+            os.unlink(afile)
+
+        # Restart the stopped primary server
+        start_server((anode['ip'], anode['port']), self.ipport2server)
+
+        # Make sure updaters run
+        Manager(['container-updater']).once()
+
+        # Collect the observable state from containers again and examine it
+        outstanding_files_new = []
+        for cfile in cont_db_files:
+
+            # We aren't catching DatabaseConnectionError, because
+            # we only want to approve of DBs that were quarantined,
+            # and not otherwise damaged. So if the code below throws
+            # an exception for some other reason, we want the test to fail.
+            if not os.path.exists(cfile):
+                continue
+
+            broker = ContainerBroker(cfile)
+            try:
+                info = broker.get_info()
+            except LockTimeout:
+                self.fail('LockTimeout at %s' % (cfile,))
+            if Timestamp(info['put_timestamp']) <= 0:
+                self.fail('No put_timestamp at %s' % (cfile,))
+            # Correct even if reported_put_timestamp is zero.
+            if info['put_timestamp'] > info['reported_put_timestamp']:
+                outstanding_files_new.append(cfile)
+        self.assertEqual(len(outstanding_files_new), 0)
+
+        self.get_to_final_state()
+
+
+if __name__ == '__main__':
+    main()
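
The test's pass/fail pivot is the same check applied before and after the
updater runs: a container DB still owes its account an update while
put_timestamp is ahead of reported_put_timestamp. Distilled into a helper for
clarity (a sketch, not part of the commit):

    def has_outstanding_update(info):
        # info is the dict from ContainerBroker.get_info(); the comparison
        # also holds while reported_put_timestamp is still zero.
        return info['put_timestamp'] > info['reported_put_timestamp']

Before the account DBs are unlinked, at least one replica must report an
outstanding update; after the updater pass quarantines the orphan container,
none may remain.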