Band-aid and test the crash of the account server
A better fix is in the works — see change Ic53068867feb0c18c88ddbe029af83a970336545 — but it is taking too long to land and users are unhappy right now. Related: rhbz#1838242, rhbz#1965348. Change-Id: I3f7bfc2877355b7cb433af77c4e2dfdfa94ff14d
This commit is contained in:
parent
901d2e15b7
commit
bcff1282b5
@ -185,8 +185,9 @@ class AccountController(BaseStorageServer):
|
||||
broker.initialize(timestamp.internal)
|
||||
except DatabaseAlreadyExists:
|
||||
pass
|
||||
if req.headers.get('x-account-override-deleted', 'no').lower() != \
|
||||
'yes' and broker.is_deleted():
|
||||
if (req.headers.get('x-account-override-deleted', 'no').lower() !=
|
||||
'yes' and broker.is_deleted()) \
|
||||
or not os.path.exists(broker.db_file):
|
||||
return HTTPNotFound(request=req)
|
||||
broker.put_container(container, req.headers['x-put-timestamp'],
|
||||
req.headers['x-delete-timestamp'],
|
||||
|
@ -271,9 +271,13 @@ class ContainerUpdater(Daemon):
|
||||
info['storage_policy_index'])
|
||||
for node in nodes]
|
||||
successes = 0
|
||||
stub404s = 0
|
||||
for event in events:
|
||||
if is_success(event.wait()):
|
||||
result = event.wait()
|
||||
if is_success(result):
|
||||
successes += 1
|
||||
if result == 404:
|
||||
stub404s += 1
|
||||
if successes >= majority_size(len(events)):
|
||||
self.logger.increment('successes')
|
||||
self.successes += 1
|
||||
@ -283,6 +287,15 @@ class ContainerUpdater(Daemon):
|
||||
broker.reported(info['put_timestamp'],
|
||||
info['delete_timestamp'], info['object_count'],
|
||||
info['bytes_used'])
|
||||
elif stub404s == len(events):
|
||||
self.logger.increment('failures')
|
||||
self.failures += 1
|
||||
self.logger.debug(
|
||||
_('Update report stub for %(container)s %(dbfile)s'),
|
||||
{'container': container, 'dbfile': dbfile})
|
||||
broker.quarantine('no account replicas exist')
|
||||
# All that's left at this point is a few sacks of Gnocchi,
|
||||
# easily collected by the dark data watcher in object auditor.
|
||||
else:
|
||||
self.logger.increment('failures')
|
||||
self.failures += 1
|
||||
|
@ -36,8 +36,8 @@ from swiftclient import get_auth, head_account, client
|
||||
from swift.common import internal_client, direct_client, utils
|
||||
from swift.common.direct_client import DirectClientException
|
||||
from swift.common.ring import Ring
|
||||
from swift.common.utils import readconf, renamer, \
|
||||
rsync_module_interpolation, md5
|
||||
from swift.common.utils import hash_path, md5, \
|
||||
readconf, renamer, rsync_module_interpolation
|
||||
from swift.common.manager import Manager
|
||||
from swift.common.storage_policy import POLICIES, EC_POLICY, REPL_POLICY
|
||||
from swift.obj.diskfile import get_data_dir
|
||||
@ -582,6 +582,13 @@ class ProbeTest(unittest.TestCase):
|
||||
return daemon
|
||||
|
||||
|
||||
def _get_db_file_path(obj_dir):
|
||||
files = sorted(os.listdir(obj_dir), reverse=True)
|
||||
for filename in files:
|
||||
if filename.endswith('db'):
|
||||
return os.path.join(obj_dir, filename)
|
||||
|
||||
|
||||
class ReplProbeTest(ProbeTest):
|
||||
|
||||
acct_cont_required_replicas = 3
|
||||
@ -625,6 +632,23 @@ class ReplProbeTest(ProbeTest):
|
||||
return self.direct_container_op(direct_client.direct_get_container,
|
||||
account, container, expect_failure)
|
||||
|
||||
def get_container_db_files(self, container):
    """Return the on-disk container DB path for each primary node.

    For every primary node of *container*, derive the node id from the
    port using the probe-test convention (port % 100) // 10, read that
    node's container-server config for its devices root, and resolve the
    hashed DB directory to a concrete .db file (None if absent).

    :param container: container name within self.account
    :returns: list of DB file paths (or None entries), one per node
    """
    opart, onodes = self.container_ring.get_nodes(self.account, container)
    # hash_path does not depend on the node; compute it once.
    hash_str = hash_path(self.account, container)
    db_files = []
    for onode in onodes:
        # NOTE(review): assumes the probe ports encode the node id as
        # (port % 100) // 10, as elsewhere in the probe tests.
        node_id = (onode['port'] % 100) // 10
        device = onode['device']
        server_conf = readconf(self.configs['container-server'][node_id])
        devices = server_conf['app:container-server']['devices']
        obj_dir = '%s/%s/containers/%s/%s/%s/' % (devices,
                                                  device, opart,
                                                  hash_str[-3:], hash_str)
        db_files.append(_get_db_file_path(obj_dir))

    return db_files
|
||||
|
||||
|
||||
class ECProbeTest(ProbeTest):
|
||||
|
||||
|
@ -14,8 +14,6 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from os import listdir
|
||||
from os.path import join as path_join
|
||||
from unittest import main
|
||||
from uuid import uuid4
|
||||
|
||||
@ -26,20 +24,13 @@ from swiftclient import client
|
||||
|
||||
from swift.common import direct_client
|
||||
from swift.common.exceptions import ClientException
|
||||
from swift.common.utils import hash_path, readconf
|
||||
from swift.common.utils import readconf
|
||||
from test.probe.common import kill_nonprimary_server, \
|
||||
kill_server, ReplProbeTest, start_server
|
||||
|
||||
eventlet.monkey_patch(all=False, socket=True)
|
||||
|
||||
|
||||
def get_db_file_path(obj_dir):
    """Return the last-sorting DB file inside *obj_dir*, or None.

    Names are considered in reverse-lexicographic order; the first one
    ending in 'db' is joined to the directory and returned.
    """
    match = next(
        (name for name in sorted(listdir(obj_dir), reverse=True)
         if name.endswith('db')),
        None)
    return path_join(obj_dir, match) if match is not None else None
|
||||
|
||||
|
||||
class TestContainerFailures(ReplProbeTest):
|
||||
|
||||
def test_one_node_fails(self):
|
||||
@ -154,23 +145,6 @@ class TestContainerFailures(ReplProbeTest):
|
||||
client.put_object(self.url, self.token, container1, 'obj3', 'x')
|
||||
self.assertEqual(caught.exception.http_status, 503)
|
||||
|
||||
def _get_container_db_files(self, container):
    """Return the on-disk container DB path for each primary node.

    For every primary node of *container*, derive the node id from the
    port via the probe convention (port % 100) // 10, read that node's
    container-server config for its devices root, and resolve the
    hashed DB directory to a concrete .db file (None if absent).

    :param container: container name within self.account
    :returns: list of DB file paths (or None entries), one per node
    """
    opart, onodes = self.container_ring.get_nodes(self.account, container)
    # hash_path does not depend on the node; compute it once.
    hash_str = hash_path(self.account, container)
    db_files = []
    for onode in onodes:
        node_id = (onode['port'] % 100) // 10
        device = onode['device']
        server_conf = readconf(self.configs['container-server'][node_id])
        devices = server_conf['app:container-server']['devices']
        obj_dir = '%s/%s/containers/%s/%s/%s/' % (devices,
                                                  device, opart,
                                                  hash_str[-3:], hash_str)
        db_files.append(get_db_file_path(obj_dir))

    return db_files
|
||||
|
||||
def test_locked_container_dbs(self):
|
||||
|
||||
def run_test(num_locks, catch_503):
|
||||
@ -179,7 +153,7 @@ class TestContainerFailures(ReplProbeTest):
|
||||
# Get the container info into memcache (so no stray
|
||||
# get_container_info calls muck up our timings)
|
||||
client.get_container(self.url, self.token, container)
|
||||
db_files = self._get_container_db_files(container)
|
||||
db_files = self.get_container_db_files(container)
|
||||
db_conns = []
|
||||
for i in range(num_locks):
|
||||
db_conn = connect(db_files[i])
|
||||
|
@ -63,7 +63,7 @@ class TestObjectFailures(ReplProbeTest):
|
||||
opart, onodes = self.object_ring.get_nodes(
|
||||
self.account, container, obj)
|
||||
onode = onodes[0]
|
||||
node_id = (onode['port'] % 100) / 10
|
||||
node_id = (onode['port'] % 100) // 10
|
||||
device = onode['device']
|
||||
hash_str = hash_path(self.account, container, obj)
|
||||
obj_server_conf = readconf(self.configs['object-server'][node_id])
|
||||
|
150
test/probe/test_orphan_container.py
Normal file
150
test/probe/test_orphan_container.py
Normal file
@ -0,0 +1,150 @@
|
||||
#!/usr/bin/python -u
|
||||
# Copyright (c) 2010-2012 OpenStack Foundation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
# implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
|
||||
from swiftclient import client
|
||||
from unittest import main
|
||||
|
||||
from swift.common.exceptions import LockTimeout
|
||||
from swift.common.manager import Manager
|
||||
from swift.common.utils import hash_path, readconf, Timestamp
|
||||
from swift.container.backend import ContainerBroker
|
||||
|
||||
from test.probe.common import (
|
||||
kill_nonprimary_server, kill_server, start_server, ReplProbeTest)
|
||||
|
||||
# Why is this not called test_container_orphan? Because the crash
# happens in the account server, so both the account and container
# services are involved.
#
# The common way users hit this is to deploy an overcloud with TripleO
# and add Gnocchi. Gnocchi hammers Swift: its container receives updates
# all the time. Then users crash the overcloud and re-deploy it, using a
# new suffix in swift.conf. Thereafter, the container service inherits
# the old container with outstanding updates, and the container updater
# tries to send those updates to an account server that can no longer
# find the account. In this situation, in Swift 2.25.0, the account
# server tracebacks, and the cycle continues without end.
||||
|
||||
|
||||
class TestOrphanContainer(ReplProbeTest):
    """Exercise container updates whose account no longer exists."""

    def get_account_db_files(self, account):
        """Return the expected account DB path on each primary node.

        Unlike the (port_num % 100) // 10 shortcut used elsewhere in the
        probe tests, this maps ports to configs by reading every
        account-server config's bind_port, so it does not rely on the
        port-numbering convention.

        :param account: account name
        :returns: list of account DB file paths, one per primary node
        """
        # This is "more correct" than (port_num%100)//10, but is it worth it?
        # We have the assumption about port_num vs node_id embedded all over.
        account_configs = {}
        for _, cname in self.configs['account-server'].items():
            conf = readconf(cname)
            # config parser cannot know if it's a number or not, so int()
            port = int(conf['app:account-server']['bind_port'])
            account_configs[port] = conf

        part, nodes = self.account_ring.get_nodes(account)
        hash_str = hash_path(account)

        ret = []
        for node in nodes:

            data_dir = 'accounts'
            device = node['device']
            conf = account_configs[node['port']]
            devices = conf['app:account-server']['devices']

            # os.path.join is for the weak
            db_file = '%s/%s/%s/%s/%s/%s/%s.db' % (
                devices, device, data_dir, part,
                hash_str[-3:], hash_str, hash_str)
            ret.append(db_file)
        return ret

    def _outstanding_update_files(self, db_files, skip_missing=False):
        """Return the container DBs that still have unreported updates.

        A DB is outstanding when its put_timestamp is newer than its
        reported_put_timestamp.  With skip_missing=True, DBs that have
        disappeared (e.g. quarantined) are tolerated; any other damage
        still raises so the test fails loudly.
        """
        outstanding = []
        for cfile in db_files:

            # We aren't catching DatabaseConnectionError, because
            # we only want to approve of DBs that were quarantined,
            # and not otherwise damaged. So if the code below throws
            # an exception for other reason, we want the test to fail.
            if skip_missing and not os.path.exists(cfile):
                continue

            broker = ContainerBroker(cfile)
            try:
                info = broker.get_info()
            except LockTimeout:
                self.fail('LockTimeout at %s' % (cfile,))
            if Timestamp(info['put_timestamp']) <= 0:
                self.fail('No put_timestamp at %s' % (cfile,))
            # Correct even if reported_put_timestamp is zero.
            if info['put_timestamp'] > info['reported_put_timestamp']:
                outstanding.append(cfile)
        return outstanding

    def test_update_pending(self):

        # Create container
        container = 'contx'
        client.put_container(self.url, self.token, container)

        part, nodes = self.account_ring.get_nodes(self.account)
        anode = nodes[0]

        # Stop a quorum of account servers
        # This allows the put to continue later.
        kill_nonprimary_server(nodes, self.ipport2server)
        kill_server((anode['ip'], anode['port']), self.ipport2server)

        # Put object
        # This creates an outstanding update.
        client.put_object(self.url, self.token, container, 'object1', b'123')

        cont_db_files = self.get_container_db_files(container)
        self.assertEqual(len(cont_db_files), 3)

        # Collect the observable state from containers
        outstanding_files = self._outstanding_update_files(cont_db_files)
        self.assertGreater(len(outstanding_files), 0)

        # At this point the users shut everything down and screw up the
        # hash in swift.conf. But we destroy the account DB instead.
        files = self.get_account_db_files(self.account)
        for afile in files:
            os.unlink(afile)

        # Restart the stopped primary server
        start_server((anode['ip'], anode['port']), self.ipport2server)

        # Make sure updaters run
        Manager(['container-updater']).once()

        # Collect the observable state from containers again and examine
        # it; quarantined (missing) DBs are fine, anything else damaged
        # is a failure.
        outstanding_files_new = self._outstanding_update_files(
            cont_db_files, skip_missing=True)
        # Standard unittest assertion instead of the non-standard
        # assertLengthEqual, which is not part of unittest.TestCase.
        self.assertEqual(len(outstanding_files_new), 0)

        self.get_to_final_state()
|
||||
|
||||
|
||||
# Allow running this probe test directly as a script.
if __name__ == '__main__':
    main()
|
Loading…
Reference in New Issue
Block a user