From ca5505bee9f0ad057078b4743e31f7fb8b801a29 Mon Sep 17 00:00:00 2001 From: Daniel Badea Date: Fri, 31 Aug 2018 14:37:49 +0000 Subject: [PATCH] partition deleted immediately after creation There is a race between agent_audit and applying partitions manifest: 1. agent_audit starts to collect partition information for the current node 2. at the same time there is a request to apply partitions manifest for a new configuration; exec. call for running puppet is not eventlet friendly and blocks the agent including the in-progress audit 3. partitions are created and an update is sent back to conductor that sets new partition status "Ready" 4. agent audit resumes execution and sends an update with the partition info status collected before partitions manifest ran (without the new partition) 5. conductor doesn't find new partition in status update and removes it from the database Add lock around agent_audit() and config_apply_runtime_manifests() to prevent them from running both at the same time. Add lock in manage_partitions's run() and agent's _update_disk_partitions() to prevent agent from running in case partitions manifest is applied by an external trigger (not by sysinv-agent). Refactor common ipartition_update_by_ihost( ..., ipartition_get()) code. Closes-Bug: #1790159 Change-Id: I0c730f9c249810d7eea5e3192c819c498fe30602 --- sysinv/sysinv/sysinv/sysinv/agent/manager.py | 58 +++++++++---------- .../sysinv/sysinv/cmd/manage-partitions | 8 ++- .../sysinv/sysinv/sysinv/common/constants.py | 2 + sysinv/sysinv/sysinv/sysinv/common/utils.py | 19 ++++++ .../sysinv/sysinv/sysinv/conductor/manager.py | 17 +----- 5 files changed, 57 insertions(+), 47 deletions(-) diff --git a/sysinv/sysinv/sysinv/sysinv/agent/manager.py b/sysinv/sysinv/sysinv/sysinv/agent/manager.py index 3eae17cffb..032f4a0e36 100644 --- a/sysinv/sysinv/sysinv/sysinv/agent/manager.py +++ b/sysinv/sysinv/sysinv/sysinv/agent/manager.py @@ -102,6 +102,8 @@ FIRST_BOOT_FLAG = os.path.join( PUPPET_HIERADATA_PATH = os.path.join(tsc.PUPPET_PATH, 'hieradata') +LOCK_AGENT_ACTION = 'agent-exclusive-action' + class FakeGlobalSectionHead(object): def __init__(self, fp): @@ -837,19 +839,8 @@ class AgentManager(service.PeriodicService): LOG.exception("Sysinv Agent exception updating idisk conductor.") pass - ipartition = self._ipartition_operator.ipartition_get() - try: - rpcapi.ipartition_update_by_ihost(icontext, - ihost['uuid'], - ipartition) - except AttributeError: - # safe to ignore during upgrades - LOG.warn("Skip updating ipartition conductor. " - "Upgrade in progress?") - except exception.SysinvException: - LOG.exception("Sysinv Agent exception updating ipartition" - " conductor.") - pass + self._update_disk_partitions(rpcapi, icontext, + ihost['uuid'], force_update=True) ipv = self._ipv_operator.ipv_get() try: @@ -1025,6 +1016,27 @@ class AgentManager(service.PeriodicService): return False return True + @utils.synchronized(constants.PARTITION_MANAGE_LOCK) + def _update_disk_partitions(self, rpcapi, icontext, + host_uuid, force_update=False): + ipartition = self._ipartition_operator.ipartition_get() + if not force_update: + if self._prev_partition == ipartition: + return + self._prev_partition = ipartition + try: + rpcapi.ipartition_update_by_ihost( + icontext, host_uuid, ipartition) + except AttributeError: + # safe to ignore during upgrades + LOG.warn("Skip updating ipartition conductor. " + "Upgrade in progress?") + except exception.SysinvException: + LOG.exception("Sysinv Agent exception updating " + "ipartition conductor.") + if not force_update: + self._prev_partition = None + @periodic_task.periodic_task(spacing=CONF.agent.audit_interval, run_immediately=True) def _agent_audit(self, context): @@ -1032,6 +1044,7 @@ class AgentManager(service.PeriodicService): self.agent_audit(context, host_uuid=self._ihost_uuid, force_updates=None) + @utils.synchronized(LOCK_AGENT_ACTION, external=False) def agent_audit(self, context, host_uuid, force_updates, cinder_device=None): # perform inventory audit if self._ihost_uuid != host_uuid: @@ -1199,23 +1212,7 @@ class AgentManager(service.PeriodicService): # Update disk partitions if self._ihost_personality != constants.STORAGE: - ipartition = self._ipartition_operator.ipartition_get() - if ((self._prev_partition is None) or - (self._prev_partition != ipartition)): - self._prev_partition = ipartition - try: - rpcapi.ipartition_update_by_ihost(icontext, - self._ihost_uuid, - ipartition) - except AttributeError: - # safe to ignore during upgrades - LOG.warn("Skip updating ipartition conductor. " - "Upgrade in progress?") - except exception.SysinvException: - LOG.exception("Sysinv Agent exception updating " - "ipartition conductor.") - self._prev_partition = None - pass + self._update_disk_partitions(rpcapi, icontext, self._ihost_uuid) # Update physical volumes ipv = self._ipv_operator.ipv_get(cinder_device=cinder_device) @@ -1357,6 +1354,7 @@ class AgentManager(service.PeriodicService): self._update_config_applied(iconfig_uuid) self._report_config_applied(context) + @utils.synchronized(LOCK_AGENT_ACTION, external=False) def config_apply_runtime_manifest(self, context, config_uuid, config_dict): """Asynchronously, have the agent apply the runtime manifest with the list of supplied tasks. diff --git a/sysinv/sysinv/sysinv/sysinv/cmd/manage-partitions b/sysinv/sysinv/sysinv/sysinv/cmd/manage-partitions index 6503eac161..e6ff9ab274 100755 --- a/sysinv/sysinv/sysinv/sysinv/cmd/manage-partitions +++ b/sysinv/sysinv/sysinv/sysinv/cmd/manage-partitions @@ -859,6 +859,11 @@ CONF.register_cli_opt( handler=add_action_parsers)) +@utils.synchronized(constants.PARTITION_MANAGE_LOCK) +def run(action, data, mode, pfile): + action(data, mode, pfile) + + def main(argv): sysinv_service.prepare_service(argv) global LOG @@ -873,7 +878,8 @@ def main(argv): "data": CONF.action.data}) LOG.info(msg) print msg - CONF.action.func(CONF.action.data, CONF.action.mode, CONF.action.pfile) + run(CONF.action.func, CONF.action.data, + CONF.action.mode, CONF.action.pfile) else: LOG.error(_("Unknown action: %(action)") % {"action": CONF.action.name}) diff --git a/sysinv/sysinv/sysinv/sysinv/common/constants.py b/sysinv/sysinv/sysinv/sysinv/common/constants.py index e3d49b7a4f..6c2e8fe203 100644 --- a/sysinv/sysinv/sysinv/sysinv/common/constants.py +++ b/sysinv/sysinv/sysinv/sysinv/common/constants.py @@ -1125,6 +1125,8 @@ PARTITION_NAME_PV = "LVM Physical Volume" PARTITION_TABLE_GPT = "gpt" PARTITION_TABLE_MSDOS = "msdos" +PARTITION_MANAGE_LOCK = "partition-manage" + # Optional services ALL_OPTIONAL_SERVICES = [SERVICE_TYPE_CINDER, SERVICE_TYPE_MURANO, SERVICE_TYPE_MAGNUM, SERVICE_TYPE_SWIFT, diff --git a/sysinv/sysinv/sysinv/sysinv/common/utils.py b/sysinv/sysinv/sysinv/sysinv/common/utils.py index c38b4aefa1..2f35684f89 100644 --- a/sysinv/sysinv/sysinv/sysinv/common/utils.py +++ b/sysinv/sysinv/sysinv/sysinv/common/utils.py @@ -31,11 +31,13 @@ import errno import functools import fcntl import glob +import grp import hashlib import itertools as it import json import math import os +import pwd import random import re import shutil @@ -1274,8 +1276,25 @@ def bytes_to_MiB(bytes_number): return bytes_number / float(1024 ** 2) +def check_lock_path(): + if os.path.isdir(constants.SYSINV_LOCK_PATH): + return + try: + uid = pwd.getpwnam(constants.SYSINV_USERNAME).pw_uid + gid = grp.getgrnam(constants.SYSINV_GRPNAME).gr_gid + os.makedirs(constants.SYSINV_LOCK_PATH) + os.chown(constants.SYSINV_LOCK_PATH, uid, gid) + LOG.info("Created directory=%s" % + constants.SYSINV_LOCK_PATH) + + except OSError as e: + LOG.exception("makedir %s OSError=%s encountered" % + (constants.SYSINV_LOCK_PATH, e)) + + def synchronized(name, external=True): if external: + check_lock_path() lock_path = constants.SYSINV_LOCK_PATH else: lock_path = None diff --git a/sysinv/sysinv/sysinv/sysinv/conductor/manager.py b/sysinv/sysinv/sysinv/sysinv/conductor/manager.py index bfbae00992..d8ad2f4e6d 100644 --- a/sysinv/sysinv/sysinv/sysinv/conductor/manager.py +++ b/sysinv/sysinv/sysinv/sysinv/conductor/manager.py @@ -32,11 +32,9 @@ collection of inventory data for each host. import errno import filecmp import glob -import grp import hashlib import httplib import os -import pwd import re import shutil import socket @@ -184,20 +182,7 @@ class ConductorManager(service.PeriodicService): # create /var/run/sysinv if required. On DOR, the manifests # may not run to create this volatile directory. - - if not os.path.isdir(constants.SYSINV_LOCK_PATH): - try: - uid = pwd.getpwnam(constants.SYSINV_USERNAME).pw_uid - gid = grp.getgrnam(constants.SYSINV_GRPNAME).gr_gid - os.makedirs(constants.SYSINV_LOCK_PATH) - os.chown(constants.SYSINV_LOCK_PATH, uid, gid) - LOG.info("Created directory=%s" % - constants.SYSINV_LOCK_PATH) - - except OSError as e: - LOG.exception("makedir %s OSError=%s encountered" % - (constants.SYSINV_LOCK_PATH, e)) - pass + cutils.check_lock_path() system = self._create_default_system()