Merge "Speed up reaper for a big account delete and some better error handling"

This commit is contained in:
Jenkins
2015-08-11 11:22:19 +00:00
committed by Gerrit Code Review
2 changed files with 126 additions and 23 deletions

View File

@@ -15,10 +15,12 @@
import os import os
import random import random
import socket
from swift import gettext_ as _ from swift import gettext_ as _
from logging import DEBUG from logging import DEBUG
from math import sqrt from math import sqrt
from time import time from time import time
from hashlib import md5
import itertools import itertools
from eventlet import GreenPool, sleep, Timeout from eventlet import GreenPool, sleep, Timeout
@@ -70,6 +72,7 @@ class AccountReaper(Daemon):
self.node_timeout = int(conf.get('node_timeout', 10)) self.node_timeout = int(conf.get('node_timeout', 10))
self.conn_timeout = float(conf.get('conn_timeout', 0.5)) self.conn_timeout = float(conf.get('conn_timeout', 0.5))
self.myips = whataremyips(conf.get('bind_ip', '0.0.0.0')) self.myips = whataremyips(conf.get('bind_ip', '0.0.0.0'))
self.bind_port = int(conf.get('bind_port', 0))
self.concurrency = int(conf.get('concurrency', 25)) self.concurrency = int(conf.get('concurrency', 25))
self.container_concurrency = self.object_concurrency = \ self.container_concurrency = self.object_concurrency = \
sqrt(self.concurrency) sqrt(self.concurrency)
@@ -79,6 +82,7 @@ class AccountReaper(Daemon):
self.delay_reaping = int(conf.get('delay_reaping') or 0) self.delay_reaping = int(conf.get('delay_reaping') or 0)
reap_warn_after = float(conf.get('reap_warn_after') or 86400 * 30) reap_warn_after = float(conf.get('reap_warn_after') or 86400 * 30)
self.reap_not_done_after = reap_warn_after + self.delay_reaping self.reap_not_done_after = reap_warn_after + self.delay_reaping
self.start_time = time()
def get_account_ring(self): def get_account_ring(self):
"""The account :class:`swift.common.ring.Ring` for the cluster.""" """The account :class:`swift.common.ring.Ring` for the cluster."""
@@ -161,9 +165,16 @@ class AccountReaper(Daemon):
if not partition.isdigit(): if not partition.isdigit():
continue continue
nodes = self.get_account_ring().get_part_nodes(int(partition)) nodes = self.get_account_ring().get_part_nodes(int(partition))
if (not is_local_device(self.myips, None, nodes[0]['ip'], None) if not os.path.isdir(partition_path):
or not os.path.isdir(partition_path)):
continue continue
container_shard = None
for container_shard, node in enumerate(nodes):
if is_local_device(self.myips, None, node['ip'], None) and \
(not self.bind_port or self.bind_port == node['port']):
break
else:
continue
for suffix in os.listdir(partition_path): for suffix in os.listdir(partition_path):
suffix_path = os.path.join(partition_path, suffix) suffix_path = os.path.join(partition_path, suffix)
if not os.path.isdir(suffix_path): if not os.path.isdir(suffix_path):
@@ -181,7 +192,9 @@ class AccountReaper(Daemon):
AccountBroker(os.path.join(hsh_path, fname)) AccountBroker(os.path.join(hsh_path, fname))
if broker.is_status_deleted() and \ if broker.is_status_deleted() and \
not broker.empty(): not broker.empty():
self.reap_account(broker, partition, nodes) self.reap_account(
broker, partition, nodes,
container_shard=container_shard)
def reset_stats(self): def reset_stats(self):
self.stats_return_codes = {} self.stats_return_codes = {}
@@ -192,7 +205,7 @@ class AccountReaper(Daemon):
self.stats_containers_possibly_remaining = 0 self.stats_containers_possibly_remaining = 0
self.stats_objects_possibly_remaining = 0 self.stats_objects_possibly_remaining = 0
def reap_account(self, broker, partition, nodes): def reap_account(self, broker, partition, nodes, container_shard=None):
""" """
Called once per pass for each account this server is the primary for Called once per pass for each account this server is the primary for
and attempts to delete the data for the given account. The reaper will and attempts to delete the data for the given account. The reaper will
@@ -219,6 +232,8 @@ class AccountReaper(Daemon):
:param broker: The AccountBroker for the account to delete. :param broker: The AccountBroker for the account to delete.
:param partition: The partition in the account ring the account is on. :param partition: The partition in the account ring the account is on.
:param nodes: The primary node dicts for the account to delete. :param nodes: The primary node dicts for the account to delete.
:param container_shard: int used to shard containers reaped. If None,
will reap all containers.
.. seealso:: .. seealso::
@@ -237,16 +252,24 @@ class AccountReaper(Daemon):
account = info['account'] account = info['account']
self.logger.info(_('Beginning pass on account %s'), account) self.logger.info(_('Beginning pass on account %s'), account)
self.reset_stats() self.reset_stats()
container_limit = 1000
if container_shard is not None:
container_limit *= len(nodes)
try: try:
marker = '' marker = ''
while True: while True:
containers = \ containers = \
list(broker.list_containers_iter(1000, marker, None, None, list(broker.list_containers_iter(container_limit, marker,
None)) None, None, None))
if not containers: if not containers:
break break
try: try:
for (container, _junk, _junk, _junk) in containers: for (container, _junk, _junk, _junk) in containers:
this_shard = int(md5(container).hexdigest(), 16) % \
len(nodes)
if container_shard not in (this_shard, None):
continue
self.container_pool.spawn(self.reap_container, account, self.container_pool.spawn(self.reap_container, account,
partition, nodes, container) partition, nodes, container)
self.container_pool.waitall() self.container_pool.waitall()
@@ -351,6 +374,10 @@ class AccountReaper(Daemon):
self.stats_return_codes.get(err.http_status / 100, 0) + 1 self.stats_return_codes.get(err.http_status / 100, 0) + 1
self.logger.increment( self.logger.increment(
'return_codes.%d' % (err.http_status / 100,)) 'return_codes.%d' % (err.http_status / 100,))
except (Timeout, socket.error) as err:
self.logger.error(
_('Timeout Exception with %(ip)s:%(port)s/%(device)s'),
node)
if not objects: if not objects:
break break
try: try:
@@ -403,6 +430,12 @@ class AccountReaper(Daemon):
self.stats_return_codes.get(err.http_status / 100, 0) + 1 self.stats_return_codes.get(err.http_status / 100, 0) + 1
self.logger.increment( self.logger.increment(
'return_codes.%d' % (err.http_status / 100,)) 'return_codes.%d' % (err.http_status / 100,))
except (Timeout, socket.error) as err:
self.logger.error(
_('Timeout Exception with %(ip)s:%(port)s/%(device)s'),
node)
failures += 1
self.logger.increment('containers_failures')
if successes > failures: if successes > failures:
self.stats_containers_deleted += 1 self.stats_containers_deleted += 1
self.logger.increment('containers_deleted') self.logger.increment('containers_deleted')
@@ -473,6 +506,12 @@ class AccountReaper(Daemon):
self.stats_return_codes.get(err.http_status / 100, 0) + 1 self.stats_return_codes.get(err.http_status / 100, 0) + 1
self.logger.increment( self.logger.increment(
'return_codes.%d' % (err.http_status / 100,)) 'return_codes.%d' % (err.http_status / 100,))
except (Timeout, socket.error) as err:
failures += 1
self.logger.increment('objects_failures')
self.logger.error(
_('Timeout Exception with %(ip)s:%(port)s/%(device)s'),
node)
if successes > failures: if successes > failures:
self.stats_objects_deleted += 1 self.stats_objects_deleted += 1
self.logger.increment('objects_deleted') self.logger.increment('objects_deleted')

View File

@@ -77,6 +77,7 @@ class FakeBroker(object):
class FakeAccountBroker(object): class FakeAccountBroker(object):
def __init__(self, containers): def __init__(self, containers):
self.containers = containers self.containers = containers
self.containers_yielded = []
def get_info(self): def get_info(self):
info = {'account': 'a', info = {'account': 'a',
@@ -101,11 +102,11 @@ class FakeRing(object):
'port': 6002, 'port': 6002,
'device': None}, 'device': None},
{'id': '2', {'id': '2',
'ip': '10.10.10.1', 'ip': '10.10.10.2',
'port': 6002, 'port': 6002,
'device': None}, 'device': None},
{'id': '3', {'id': '3',
'ip': '10.10.10.1', 'ip': '10.10.10.3',
'port': 6002, 'port': 6002,
'device': None}, 'device': None},
] ]
@@ -504,24 +505,26 @@ class TestReaper(unittest.TestCase):
self.called_amount = 0 self.called_amount = 0
self.r = r = self.init_reaper({}, fakelogger=True) self.r = r = self.init_reaper({}, fakelogger=True)
r.start_time = time.time() r.start_time = time.time()
ctx = [patch('swift.account.reaper.AccountReaper.reap_container', with patch('swift.account.reaper.AccountReaper.reap_container',
self.fake_reap_container), self.fake_reap_container), \
patch('swift.account.reaper.AccountReaper.get_account_ring', patch('swift.account.reaper.AccountReaper.get_account_ring',
self.fake_account_ring)] self.fake_account_ring):
with nested(*ctx):
nodes = r.get_account_ring().get_part_nodes() nodes = r.get_account_ring().get_part_nodes()
self.assertTrue(r.reap_account(broker, 'partition', nodes)) for container_shard, node in enumerate(nodes):
self.assertTrue(
r.reap_account(broker, 'partition', nodes,
container_shard=container_shard))
self.assertEqual(self.called_amount, 4) self.assertEqual(self.called_amount, 4)
info_lines = r.logger.get_lines_for_level('info') info_lines = r.logger.get_lines_for_level('info')
self.assertEqual(len(info_lines), 2) self.assertEqual(len(info_lines), 6)
start_line, stat_line = info_lines for start_line, stat_line in zip(*[iter(info_lines)] * 2):
self.assertEqual(start_line, 'Beginning pass on account a') self.assertEqual(start_line, 'Beginning pass on account a')
self.assertTrue(stat_line.find('1 containers deleted')) self.assertTrue(stat_line.find('1 containers deleted'))
self.assertTrue(stat_line.find('1 objects deleted')) self.assertTrue(stat_line.find('1 objects deleted'))
self.assertTrue(stat_line.find('1 containers remaining')) self.assertTrue(stat_line.find('1 containers remaining'))
self.assertTrue(stat_line.find('1 objects remaining')) self.assertTrue(stat_line.find('1 objects remaining'))
self.assertTrue(stat_line.find('1 containers possibly remaining')) self.assertTrue(stat_line.find('1 containers possibly remaining'))
self.assertTrue(stat_line.find('1 objects possibly remaining')) self.assertTrue(stat_line.find('1 objects possibly remaining'))
def test_reap_account_no_container(self): def test_reap_account_no_container(self):
broker = FakeAccountBroker(tuple()) broker = FakeAccountBroker(tuple())
@@ -584,6 +587,67 @@ class TestReaper(unittest.TestCase):
r.reap_device('sda1') r.reap_device('sda1')
self.assertEqual(self.called_amount, 0) self.assertEqual(self.called_amount, 0)
def test_reap_device_with_sharding(self):
devices = self.prepare_data_dir()
conf = {'devices': devices}
r = self.init_reaper(conf, myips=['10.10.10.2'])
container_shard_used = [-1]
def fake_reap_account(*args, **kwargs):
container_shard_used[0] = kwargs.get('container_shard')
with patch('swift.account.reaper.AccountBroker',
FakeAccountBroker), \
patch('swift.account.reaper.AccountReaper.get_account_ring',
self.fake_account_ring), \
patch('swift.account.reaper.AccountReaper.reap_account',
fake_reap_account):
r.reap_device('sda1')
# 10.10.10.2 is second node from ring
self.assertEqual(container_shard_used[0], 1)
def test_reap_account_with_sharding(self):
devices = self.prepare_data_dir()
self.called_amount = 0
conf = {'devices': devices}
r = self.init_reaper(conf, myips=['10.10.10.2'])
container_reaped = [0]
def fake_list_containers_iter(self, *args):
for container in self.containers:
if container in self.containers_yielded:
continue
yield container, None, None, None
self.containers_yielded.append(container)
def fake_reap_container(self, account, account_partition,
account_nodes, container):
container_reaped[0] += 1
ctx = [patch('swift.account.reaper.AccountBroker',
FakeAccountBroker),
patch('swift.account.reaper.AccountBroker.list_containers_iter',
fake_list_containers_iter),
patch('swift.account.reaper.AccountReaper.reap_container',
fake_reap_container), ]
fake_ring = FakeRing()
with nested(*ctx):
fake_broker = FakeAccountBroker(['c', 'd', 'e'])
r.reap_account(fake_broker, 10, fake_ring.nodes, 0)
self.assertEqual(container_reaped[0], 1)
fake_broker = FakeAccountBroker(['c', 'd', 'e'])
container_reaped[0] = 0
r.reap_account(fake_broker, 10, fake_ring.nodes, 1)
self.assertEqual(container_reaped[0], 2)
container_reaped[0] = 0
fake_broker = FakeAccountBroker(['c', 'd', 'e'])
r.reap_account(fake_broker, 10, fake_ring.nodes, 2)
self.assertEqual(container_reaped[0], 0)
def test_run_once(self): def test_run_once(self):
def prepare_data_dir(): def prepare_data_dir():
devices_path = tempfile.mkdtemp() devices_path = tempfile.mkdtemp()