container-updater: temporary account update suppression on errors

This commit is contained in:
gholt 2011-01-26 00:11:49 +00:00 committed by Tarmac
commit edb4e90ebb
4 changed files with 69 additions and 19 deletions

View File

@ -371,19 +371,25 @@ reclaim_age 604800 Time elapsed in seconds before a
[container-updater]
================== ================= =======================================
Option Default Description
------------------ ----------------- ---------------------------------------
log_name container-updater Label used when logging
log_facility LOG_LOCAL0 Syslog log facility
log_level INFO Logging level
interval 300 Minimum time for a pass to take
concurrency 4 Number of updater workers to spawn
node_timeout 3 Request timeout to external services
conn_timeout 0.5 Connection timeout to external services
slowdown 0.01 Time in seconds to wait between
containers
================== ================= =======================================
======================== ================= ==================================
Option Default Description
------------------------ ----------------- ----------------------------------
log_name container-updater Label used when logging
log_facility LOG_LOCAL0 Syslog log facility
log_level INFO Logging level
interval 300 Minimum time for a pass to take
concurrency 4 Number of updater workers to spawn
node_timeout 3 Request timeout to external
services
conn_timeout 0.5 Connection timeout to external
services
slowdown 0.01 Time in seconds to wait between
containers
account_suppression_time 60 Seconds to suppress updating an
account that has generated an
error (timeout, not yet found,
etc.)
======================== ================= ==================================
[container-auditor]

View File

@ -50,6 +50,8 @@ use = egg:swift#container
# conn_timeout = 0.5
# slowdown will sleep that amount between containers
# slowdown = 0.01
# Seconds to suppress updating an account that has generated an error
# account_suppression_time = 60
[container-auditor]
# You can override the default log routing for this app here (don't use set!):

View File

@ -19,6 +19,7 @@ import signal
import sys
import time
from random import random, shuffle
from tempfile import mkstemp
from eventlet import spawn, patcher, Timeout
@ -51,6 +52,10 @@ class ContainerUpdater(Daemon):
self.no_changes = 0
self.successes = 0
self.failures = 0
self.account_suppressions = {}
self.account_suppression_time = \
float(conf.get('account_suppression_time', 60))
self.new_account_suppressions = None
def get_account_ring(self):
"""Get the account ring. Load it if it hasn't been yet."""
@ -80,6 +85,19 @@ class ContainerUpdater(Daemon):
shuffle(paths)
return paths
def _load_suppressions(self, filename):
try:
with open(filename, 'r') as tmpfile:
for line in tmpfile:
account, until = line.split()
until = float(until)
self.account_suppressions[account] = until
except:
self.logger.exception(
_('ERROR with loading suppressions from %s: ') % filename)
finally:
os.unlink(filename)
def run_forever(self): # pragma: no cover
"""
Run the updator continuously.
@ -88,21 +106,33 @@ class ContainerUpdater(Daemon):
while True:
self.logger.info(_('Begin container update sweep'))
begin = time.time()
pids = []
now = time.time()
expired_suppressions = \
[a for a, u in self.account_suppressions.iteritems() if u < now]
for account in expired_suppressions:
del self.account_suppressions[account]
pid2filename = {}
# read from account ring to ensure it's fresh
self.get_account_ring().get_nodes('')
for path in self.get_paths():
while len(pids) >= self.concurrency:
pids.remove(os.wait()[0])
while len(pid2filename) >= self.concurrency:
pid = os.wait()[0]
try:
self._load_suppressions(pid2filename[pid])
finally:
del pid2filename[pid]
fd, tmpfilename = mkstemp()
os.close(fd)
pid = os.fork()
if pid:
pids.append(pid)
pid2filename[pid] = tmpfilename
else:
signal.signal(signal.SIGTERM, signal.SIG_DFL)
patcher.monkey_patch(all=False, socket=True)
self.no_changes = 0
self.successes = 0
self.failures = 0
self.new_account_suppressions = open(tmpfilename, 'w')
forkbegin = time.time()
self.container_sweep(path)
elapsed = time.time() - forkbegin
@ -114,8 +144,12 @@ class ContainerUpdater(Daemon):
'success': self.successes, 'fail': self.failures,
'no_change': self.no_changes})
sys.exit()
while pids:
pids.remove(os.wait()[0])
while pid2filename:
pid = os.wait()[0]
try:
self._load_suppressions(pid2filename[pid])
finally:
del pid2filename[pid]
elapsed = time.time() - begin
self.logger.info(_('Container update sweep completed: %.02fs'),
elapsed)
@ -165,6 +199,8 @@ class ContainerUpdater(Daemon):
# definitely doesn't have up to date statistics.
if float(info['put_timestamp']) <= 0:
return
if self.account_suppressions.get(info['account'], 0) > time.time():
return
if info['put_timestamp'] > info['reported_put_timestamp'] or \
info['delete_timestamp'] > info['reported_delete_timestamp'] \
or info['object_count'] != info['reported_object_count'] or \
@ -195,6 +231,11 @@ class ContainerUpdater(Daemon):
self.logger.debug(
_('Update report failed for %(container)s %(dbfile)s'),
{'container': container, 'dbfile': dbfile})
self.account_suppressions[info['account']] = until = \
time.time() + self.account_suppression_time
if self.new_account_suppressions:
print >>self.new_account_suppressions, \
info['account'], until
else:
self.no_changes += 1

View File

@ -78,6 +78,7 @@ class TestContainerUpdater(unittest.TestCase):
'interval': '1',
'concurrency': '1',
'node_timeout': '15',
'account_suppression_time': 0
})
cu.run_once()
containers_dir = os.path.join(self.sda1, container_server.DATADIR)