Clean stale lockfiles on service startup: fixes bug 785955

Adds cleanup_file_locks() to nova/utils, which cleans up stale locks
left behind after process failures.

Adds a call to clean up locks on service startup for nova-api, nova-cert,
nova-compute, nova-network, nova-objectstore, and nova-scheduler.

Adds tools/clean_file_locks.py, which can be used to manually clean
stale locks.

Change-Id: I752e0b24d3c7fc5f1dc290da355cbd7f430789b8
Author: Mike Pittaro
Date: 2012-02-24 09:56:26 -08:00
parent 92e90c39fc
commit b6cb10e48b
2 changed files with 86 additions and 1 deletion
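
The standalone tools/clean_file_locks.py mentioned above is not part of the two diffs shown here. A minimal sketch of what such a script could look like, assuming it simply reuses Nova's usual flag parsing and logging bootstrap before delegating to the new utils.cleanup_file_locks(); the bootstrap calls are an assumption, not the committed file:

#!/usr/bin/env python
# Hypothetical sketch only -- the committed tools/clean_file_locks.py is not
# shown in this diff.  Flag parsing and logging setup follow the usual
# pattern of Nova's bin/ scripts from this era.
import sys

from nova import flags
from nova import log as logging
from nova import utils


def main(argv):
    """Parse flags, set up logging, then remove stale lock files."""
    flags.FLAGS(argv)
    logging.setup()
    utils.cleanup_file_locks()


if __name__ == '__main__':
    main(sys.argv)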

nova/service.py

@@ -158,6 +158,7 @@ class Service(object):
vcs_string = version.version_string_with_vcs()
LOG.audit(_('Starting %(topic)s node (version %(vcs_string)s)'),
{'topic': self.topic, 'vcs_string': vcs_string})
utils.cleanup_file_locks()
self.manager.init_host()
self.model_disconnected = False
ctxt = context.get_admin_context()
@@ -360,6 +361,7 @@ class WSGIService(object):
:returns: None
"""
utils.cleanup_file_locks()
if self.manager:
self.manager.init_host()
self.server.start()
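
The nova/utils.py change that follows leans on an implementation detail of the lockfile library: on Linux a held lock is a pair of hard-linked files (a per-process sentinel plus the nova-*.lock file), so a lock file whose link count has dropped back to 1 is stale. A small standalone demonstration of that behaviour; the temporary directory and the 'nova-demo' name are made up for illustration:

# Demonstration only: shows the hard-link behaviour that cleanup_file_locks()
# relies on.  The directory and lock name are illustrative.
import os
import tempfile

import lockfile

lock_dir = tempfile.mkdtemp()
lock = lockfile.FileLock(os.path.join(lock_dir, 'nova-demo'))

lock.acquire(timeout=1)
try:
    # While held, 'nova-demo.lock' is hard-linked to a per-process sentinel
    # named roughly '<hostname>.<thread>-<pid>', so its link count is 2.
    lock_path = os.path.join(lock_dir, 'nova-demo.lock')
    print(os.listdir(lock_dir))            # sentinel and 'nova-demo.lock'
    print(os.stat(lock_path).st_nlink)     # 2 -> lock is (probably) held
finally:
    lock.release()

# A clean release removes both files; a nova-*.lock left behind with a link
# count of 1, or a sentinel whose pid is no longer running, is exactly what
# the new cleanup routine deletes at service startup.
print(os.listdir(lock_dir))                # []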

nova/utils.py

@@ -26,7 +26,6 @@ import hashlib
import inspect
import itertools
import json
import lockfile
import os
import pyclbr
import random
@@ -46,6 +45,7 @@ from eventlet import greenthread
from eventlet import semaphore
from eventlet.green import subprocess
import iso8601
import lockfile
import netaddr
from nova import exception
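
The hunk below adds cleanup_file_locks() directly after the existing synchronized() decorator, which is where these external locks originate. A hypothetical example of the kind of code whose lock files end up under FLAGS.lock_path; the function and the 'demo' lock name are invented, only the decorator signature comes from this file:

from nova import utils


@utils.synchronized('demo', external=True)
def update_shared_resource():
    # With external=True the decorator takes a file-based lock (via the
    # lockfile module) under FLAGS.lock_path for the duration of the call.
    # If the process dies while inside this function, the lock file and its
    # sentinel are left behind and later callers block forever -- the hang
    # that calling cleanup_file_locks() at service startup prevents.
    pass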
@@ -857,6 +857,89 @@ def synchronized(name, external=False):
    return wrap


def cleanup_file_locks():
    """clean up stale locks left behind by process failures

    The lockfile module, used by @synchronized, can leave stale lockfiles
    behind after process failure. These locks can cause process hangs
    at startup, when a process deadlocks on a lock which will never
    be unlocked.

    Intended to be called at service startup.
    """

# NOTE(mikeyp) this routine incorporates some internal knowledge
# from the lockfile module, and this logic really
# should be part of that module.
#
# cleanup logic:
# 1) look for the lockfile modules's 'sentinel' files, of the form
# hostname.[thread-.*]-pid, extract the pid.
# if pid doesn't match a running process, delete the file since
# it's from a dead process.
# 2) check for the actual lockfiles. if lockfile exists with linkcount
# of 1, it's bogus, so delete it. A link count >= 2 indicates that
# there are probably sentinels still linked to it from active
# processes. This check isn't perfect, but there is no way to
# reliably tell which sentinels refer to which lock in the
# lockfile implementation.
    if FLAGS.disable_process_locking:
        return

    hostname = socket.gethostname()
    sentinel_re = hostname + r'\..*-(\d+$)'
    lockfile_re = r'nova-.*\.lock'
    files = os.listdir(FLAGS.lock_path)

    # cleanup sentinels
for filename in files:
match = re.match(sentinel_re, filename)
if match is None:
continue
pid = match.group(1)
        LOG.debug(_('Found sentinel %(filename)s for pid %(pid)s') %
                  {'filename': filename, 'pid': pid})
if not os.path.exists(os.path.join('/proc', pid)):
delete_if_exists(os.path.join(FLAGS.lock_path, filename))
            LOG.debug(_('Cleaned sentinel %(filename)s for pid %(pid)s') %
                      {'filename': filename, 'pid': pid})

    # cleanup lock files
for filename in files:
match = re.match(lockfile_re, filename)
if match is None:
continue
try:
stat_info = os.stat(os.path.join(FLAGS.lock_path, filename))
except OSError as (errno, strerror):
if errno == 2: # doesn't exist
continue
else:
raise
        msg = (_('Found lockfile %(file)s with link count %(count)d') %
               {'file': filename, 'count': stat_info.st_nlink})
LOG.debug(msg)
if stat_info.st_nlink == 1:
delete_if_exists(os.path.join(FLAGS.lock_path, filename))
            msg = (_('Cleaned lockfile %(file)s with link count %(count)d') %
                   {'file': filename, 'count': stat_info.st_nlink})
            LOG.debug(msg)


def delete_if_exists(pathname):
"""delete a file, but ignore file not found error"""
try:
os.unlink(pathname)
except OSError as (errno, strerror):
if errno == 2: # doesn't exist
return
else:
            raise


def get_from_path(items, path):
"""Returns a list of items matching the specified path.