swift/test/probe/test_orphan_container.py

#!/usr/bin/python -u
# Copyright (c) 2010-2012 OpenStack Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

from swiftclient import client
from unittest import main

from swift.common.exceptions import LockTimeout
from swift.common.manager import Manager
from swift.common.utils import hash_path, readconf, Timestamp
from swift.container.backend import ContainerBroker

from test.probe.common import (
    kill_nonprimary_server, kill_server, start_server, ReplProbeTest)

# Why is this not called test_container_orphan? Because the crash
# happens in the account server, so both account and container
# services are involved.
#
# The common way users do this is to use TripleO to deploy an overcloud
# and add Gnocchi. Gnocchi is hammering Swift, its container has updates
# all the time. Then, users crash the overcloud and re-deploy it,
# using the new suffix in swift.conf. Thereafter, container service
# inherits old container with outstanding updates, container updater
# tries to send updates to the account server, while the account cannot
# be found anymore. In this situation, in Swift 2.25.0, account server
# tracebacks, and the cycle continues without end.


class TestOrphanContainer(ReplProbeTest):

    def get_account_db_files(self, account):

        # This is "more correct" than (port_num%100)//10, but is it worth it?
        # We have the assumption about port_num vs node_id embedded all over.
        account_configs = {}
        for _, cname in self.configs['account-server'].items():
            conf = readconf(cname)
            # config parser cannot know if it's a number or not, so int()
            port = int(conf['app:account-server']['bind_port'])
            account_configs[port] = conf

        part, nodes = self.account_ring.get_nodes(account)
        hash_str = hash_path(account)

        ret = []
        for node in nodes:

            data_dir = 'accounts'
            device = node['device']
            conf = account_configs[node['port']]
            devices = conf['app:account-server']['devices']

            # os.path.join is for the weak
            db_file = '%s/%s/%s/%s/%s/%s/%s.db' % (
                devices, device, data_dir, part,
                hash_str[-3:], hash_str, hash_str)
            ret.append(db_file)
        return ret

    def test_update_pending(self):

        # Create container
        container = 'contx'
        client.put_container(self.url, self.token, container)

        part, nodes = self.account_ring.get_nodes(self.account)
        anode = nodes[0]

        # Stop a quorum of account servers
        # This allows the put to continue later.
        kill_nonprimary_server(nodes, self.ipport2server)
        kill_server((anode['ip'], anode['port']), self.ipport2server)

        # Put object
        # This creates an outstanding update.
        client.put_object(self.url, self.token, container, 'object1', b'123')

        cont_db_files = self.get_container_db_files(container)
        self.assertEqual(len(cont_db_files), 3)

        # Collect the observable state from containers
        outstanding_files = []
        for cfile in cont_db_files:
            broker = ContainerBroker(cfile)
            try:
                info = broker.get_info()
            except LockTimeout:
                self.fail('LockTimeout at %s' % (cfile,))
            if Timestamp(info['put_timestamp']) <= 0:
                self.fail('No put_timestamp at %s' % (cfile,))
            # Correct even if reported_put_timestamp is zero.
            if info['put_timestamp'] > info['reported_put_timestamp']:
                outstanding_files.append(cfile)
        self.assertGreater(len(outstanding_files), 0)

        # At this point the users shut everything down and screw up the
        # hash in swift.conf. But we destroy the account DB instead.
        files = self.get_account_db_files(self.account)
        for afile in files:
            os.unlink(afile)

        # Restart the stopped primary server
        start_server((anode['ip'], anode['port']), self.ipport2server)

        # Make sure updaters run
        Manager(['container-updater']).once()

        # Collect the observable state from containers again and examine it
        outstanding_files_new = []
        for cfile in cont_db_files:

            # We aren't catching DatabaseConnectionError, because
            # we only want to approve of DBs that were quarantined,
            # and not otherwise damaged. So if the code below throws
            # an exception for other reason, we want the test to fail.
            if not os.path.exists(cfile):
                continue

            broker = ContainerBroker(cfile)
            try:
                info = broker.get_info()
            except LockTimeout:
                self.fail('LockTimeout at %s' % (cfile,))
            if Timestamp(info['put_timestamp']) <= 0:
                self.fail('No put_timestamp at %s' % (cfile,))
            # Correct even if reported_put_timestamp is zero.
            if info['put_timestamp'] > info['reported_put_timestamp']:
                outstanding_files_new.append(cfile)
        self.assertLengthEqual(outstanding_files_new, 0)

        self.get_to_final_state()


if __name__ == '__main__':
    main()