From b6cb10e48b763e78c9434583b705b95bc9fa2c85 Mon Sep 17 00:00:00 2001
From: Mike Pittaro
Date: Fri, 24 Feb 2012 09:56:26 -0800
Subject: [PATCH] Clean stale lockfiles on service startup : fixes bug 785955

Adds cleanup_file_locks() to nova/utils, which cleans up stale locks
left behind after process failures.

Adds a call to clean up locks on service startup for nova-api,
nova-cert, nova-compute, nova-network, nova-objectstore, and
nova-scheduler.

Adds tools/clean_file_locks.py, which can be used to manually clean
stale locks.

Change-Id: I752e0b24d3c7fc5f1dc290da355cbd7f430789b8
---
 nova/service.py |   2 ++
 nova/utils.py   | 104 ++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 105 insertions(+), 1 deletion(-)

diff --git a/nova/service.py b/nova/service.py
index adf242f3d..1da10e697 100644
--- a/nova/service.py
+++ b/nova/service.py
@@ -158,6 +158,7 @@ class Service(object):
         vcs_string = version.version_string_with_vcs()
         LOG.audit(_('Starting %(topic)s node (version %(vcs_string)s)'),
                   {'topic': self.topic, 'vcs_string': vcs_string})
+        utils.cleanup_file_locks()
         self.manager.init_host()
         self.model_disconnected = False
         ctxt = context.get_admin_context()
@@ -360,6 +361,7 @@ class WSGIService(object):
         :returns: None
 
         """
+        utils.cleanup_file_locks()
         if self.manager:
             self.manager.init_host()
         self.server.start()
diff --git a/nova/utils.py b/nova/utils.py
index ec62f87fc..0f3e61897 100644
--- a/nova/utils.py
+++ b/nova/utils.py
@@ -26,7 +26,6 @@ import hashlib
 import inspect
 import itertools
 import json
-import lockfile
 import os
 import pyclbr
 import random
@@ -46,6 +45,7 @@ from eventlet import greenthread
 from eventlet import semaphore
 from eventlet.green import subprocess
 import iso8601
+import lockfile
 import netaddr
 
 from nova import exception
@@ -857,6 +857,108 @@ def synchronized(name, external=False):
     return wrap
 
 
+def cleanup_file_locks():
+    """Clean up stale locks left behind by process failures.
+
+    The lockfile module, used by @synchronized, can leave stale lockfiles
+    behind after process failure. These locks can cause process hangs
+    at startup, when a process deadlocks on a lock which will never
+    be unlocked.
+
+    Intended to be called at service startup. Returns immediately when
+    FLAGS.disable_process_locking is set.
+
+    :raises: OSError if FLAGS.lock_path cannot be listed, or if a stat
+             or unlink fails for a reason other than the file being gone
+
+    """
+    # Imported here so that this routine does not depend on the
+    # module-level import list.
+    import errno
+
+    # NOTE(mikeyp) this routine incorporates some internal knowledge
+    #              from the lockfile module, and this logic really
+    #              should be part of that module.
+    #
+    # cleanup logic:
+    # 1) look for the lockfile module's 'sentinel' files, of the form
+    #    hostname.[thread-.*]-pid, extract the pid.
+    #    if pid doesn't match a running process, delete the file since
+    #    it's from a dead process.
+    # 2) check for the actual lockfiles. if lockfile exists with linkcount
+    #    of 1, it's bogus, so delete it. A link count >= 2 indicates that
+    #    there are probably sentinels still linked to it from active
+    #    processes. This check isn't perfect, but there is no way to
+    #    reliably tell which sentinels refer to which lock in the
+    #    lockfile implementation.
+
+    if FLAGS.disable_process_locking:
+        return
+
+    # Escape the hostname so its dots (and any other regex
+    # metacharacters) match only themselves.
+    sentinel_re = re.escape(socket.gethostname()) + r'\..*-(\d+$)'
+    lockfile_re = r'nova-.*\.lock'
+    files = os.listdir(FLAGS.lock_path)
+
+    # cleanup sentinels
+    for filename in files:
+        match = re.match(sentinel_re, filename)
+        if match is None:
+            continue
+        pid = match.group(1)
+        # Translate the template first, then interpolate, so the
+        # translation lookup sees the original template string.
+        LOG.debug(_('Found sentinel %(filename)s for pid %(pid)s') %
+                  {'filename': filename, 'pid': pid})
+        # No /proc/<pid> entry: treat the owning process as dead
+        # (relies on a Linux-style /proc).
+        if not os.path.exists(os.path.join('/proc', pid)):
+            delete_if_exists(os.path.join(FLAGS.lock_path, filename))
+            LOG.debug(_('Cleaned sentinel %(filename)s for pid %(pid)s') %
+                      {'filename': filename, 'pid': pid})
+
+    # cleanup lock files
+    for filename in files:
+        match = re.match(lockfile_re, filename)
+        if match is None:
+            continue
+        try:
+            stat_info = os.stat(os.path.join(FLAGS.lock_path, filename))
+        except OSError as e:
+            if e.errno == errno.ENOENT:  # doesn't exist
+                continue
+            else:
+                raise
+        msg = (_('Found lockfile %(file)s with link count %(count)d') %
+               {'file': filename, 'count': stat_info.st_nlink})
+        LOG.debug(msg)
+        if stat_info.st_nlink == 1:
+            delete_if_exists(os.path.join(FLAGS.lock_path, filename))
+            msg = (_('Cleaned lockfile %(file)s with link count %(count)d') %
+                   {'file': filename, 'count': stat_info.st_nlink})
+            LOG.debug(msg)
+
+
+def delete_if_exists(pathname):
+    """Delete a file, ignoring the error if it does not exist.
+
+    :param pathname: path of the file to unlink
+    :raises: OSError if the unlink fails for any other reason
+
+    """
+    # Imported here so that this routine does not depend on the
+    # module-level import list.
+    import errno
+
+    try:
+        os.unlink(pathname)
+    except OSError as e:
+        if e.errno == errno.ENOENT:  # doesn't exist
+            return
+        else:
+            raise
+
+
 def get_from_path(items, path):
     """Returns a list of items matching the specified path.