partition deleted immediately after creation

There is a race between agent_audit and applying partitions manifest:
1. agent_audit starts to collect partition information for the current node
2. at the same time there is a request to apply partitions manifest for a
   new configuration; exec. call for running puppet is not eventlet friendly
   and blocks the agent including the in-progress audit
3. partitions are created and an update is sent back to conductor that sets
   new partition status "Ready"
4. agent audit resumes execution and sends an update with the partition info
   status collected before partitions manifest ran (without the new partition)
5. conductor doesn't find new partition in status update and removes it
   from the database

Add lock around agent_audit() and config_apply_runtime_manifests() to
prevent them from running both at the same time.

Add lock in manage_partitions's run() and agent's _update_disk_partitions()
to prevent agent from running in case partitions manifest is applied by
an external trigger (not by sysinv-agent).

Refactor common ipartition_update_by_ihost( ..., ipartition_get()) code.

Closes-Bug: #1790159
Change-Id: I0c730f9c249810d7eea5e3192c819c498fe30602
This commit is contained in:
Daniel Badea 2018-08-31 14:37:49 +00:00
parent f19dd0498f
commit ca5505bee9
5 changed files with 57 additions and 47 deletions

View File

@ -102,6 +102,8 @@ FIRST_BOOT_FLAG = os.path.join(
PUPPET_HIERADATA_PATH = os.path.join(tsc.PUPPET_PATH, 'hieradata')
LOCK_AGENT_ACTION = 'agent-exclusive-action'
class FakeGlobalSectionHead(object):
def __init__(self, fp):
@ -837,19 +839,8 @@ class AgentManager(service.PeriodicService):
LOG.exception("Sysinv Agent exception updating idisk conductor.")
pass
ipartition = self._ipartition_operator.ipartition_get()
try:
rpcapi.ipartition_update_by_ihost(icontext,
ihost['uuid'],
ipartition)
except AttributeError:
# safe to ignore during upgrades
LOG.warn("Skip updating ipartition conductor. "
"Upgrade in progress?")
except exception.SysinvException:
LOG.exception("Sysinv Agent exception updating ipartition"
" conductor.")
pass
self._update_disk_partitions(rpcapi, icontext,
ihost['uuid'], force_update=True)
ipv = self._ipv_operator.ipv_get()
try:
@ -1025,6 +1016,27 @@ class AgentManager(service.PeriodicService):
return False
return True
@utils.synchronized(constants.PARTITION_MANAGE_LOCK)
def _update_disk_partitions(self, rpcapi, icontext,
host_uuid, force_update=False):
ipartition = self._ipartition_operator.ipartition_get()
if not force_update:
if self._prev_partition == ipartition:
return
self._prev_partition = ipartition
try:
rpcapi.ipartition_update_by_ihost(
icontext, host_uuid, ipartition)
except AttributeError:
# safe to ignore during upgrades
LOG.warn("Skip updating ipartition conductor. "
"Upgrade in progress?")
except exception.SysinvException:
LOG.exception("Sysinv Agent exception updating "
"ipartition conductor.")
if not force_update:
self._prev_partition = None
@periodic_task.periodic_task(spacing=CONF.agent.audit_interval,
run_immediately=True)
def _agent_audit(self, context):
@ -1032,6 +1044,7 @@ class AgentManager(service.PeriodicService):
self.agent_audit(context, host_uuid=self._ihost_uuid,
force_updates=None)
@utils.synchronized(LOCK_AGENT_ACTION, external=False)
def agent_audit(self, context, host_uuid, force_updates, cinder_device=None):
# perform inventory audit
if self._ihost_uuid != host_uuid:
@ -1199,23 +1212,7 @@ class AgentManager(service.PeriodicService):
# Update disk partitions
if self._ihost_personality != constants.STORAGE:
ipartition = self._ipartition_operator.ipartition_get()
if ((self._prev_partition is None) or
(self._prev_partition != ipartition)):
self._prev_partition = ipartition
try:
rpcapi.ipartition_update_by_ihost(icontext,
self._ihost_uuid,
ipartition)
except AttributeError:
# safe to ignore during upgrades
LOG.warn("Skip updating ipartition conductor. "
"Upgrade in progress?")
except exception.SysinvException:
LOG.exception("Sysinv Agent exception updating "
"ipartition conductor.")
self._prev_partition = None
pass
self._update_disk_partitions(rpcapi, icontext, self._ihost_uuid)
# Update physical volumes
ipv = self._ipv_operator.ipv_get(cinder_device=cinder_device)
@ -1357,6 +1354,7 @@ class AgentManager(service.PeriodicService):
self._update_config_applied(iconfig_uuid)
self._report_config_applied(context)
@utils.synchronized(LOCK_AGENT_ACTION, external=False)
def config_apply_runtime_manifest(self, context, config_uuid, config_dict):
"""Asynchronously, have the agent apply the runtime manifest with the
list of supplied tasks.

View File

@ -859,6 +859,11 @@ CONF.register_cli_opt(
handler=add_action_parsers))
@utils.synchronized(constants.PARTITION_MANAGE_LOCK)
def run(action, data, mode, pfile):
action(data, mode, pfile)
def main(argv):
sysinv_service.prepare_service(argv)
global LOG
@ -873,7 +878,8 @@ def main(argv):
"data": CONF.action.data})
LOG.info(msg)
print msg
CONF.action.func(CONF.action.data, CONF.action.mode, CONF.action.pfile)
run(CONF.action.func, CONF.action.data,
CONF.action.mode, CONF.action.pfile)
else:
LOG.error(_("Unknown action: %(action)") % {"action":
CONF.action.name})

View File

@ -1125,6 +1125,8 @@ PARTITION_NAME_PV = "LVM Physical Volume"
PARTITION_TABLE_GPT = "gpt"
PARTITION_TABLE_MSDOS = "msdos"
PARTITION_MANAGE_LOCK = "partition-manage"
# Optional services
ALL_OPTIONAL_SERVICES = [SERVICE_TYPE_CINDER, SERVICE_TYPE_MURANO,
SERVICE_TYPE_MAGNUM, SERVICE_TYPE_SWIFT,

View File

@ -31,11 +31,13 @@ import errno
import functools
import fcntl
import glob
import grp
import hashlib
import itertools as it
import json
import math
import os
import pwd
import random
import re
import shutil
@ -1274,8 +1276,25 @@ def bytes_to_MiB(bytes_number):
return bytes_number / float(1024 ** 2)
def check_lock_path():
if os.path.isdir(constants.SYSINV_LOCK_PATH):
return
try:
uid = pwd.getpwnam(constants.SYSINV_USERNAME).pw_uid
gid = grp.getgrnam(constants.SYSINV_GRPNAME).gr_gid
os.makedirs(constants.SYSINV_LOCK_PATH)
os.chown(constants.SYSINV_LOCK_PATH, uid, gid)
LOG.info("Created directory=%s" %
constants.SYSINV_LOCK_PATH)
except OSError as e:
LOG.exception("makedir %s OSError=%s encountered" %
(constants.SYSINV_LOCK_PATH, e))
def synchronized(name, external=True):
if external:
check_lock_path()
lock_path = constants.SYSINV_LOCK_PATH
else:
lock_path = None

View File

@ -32,11 +32,9 @@ collection of inventory data for each host.
import errno
import filecmp
import glob
import grp
import hashlib
import httplib
import os
import pwd
import re
import shutil
import socket
@ -184,20 +182,7 @@ class ConductorManager(service.PeriodicService):
# create /var/run/sysinv if required. On DOR, the manifests
# may not run to create this volatile directory.
if not os.path.isdir(constants.SYSINV_LOCK_PATH):
try:
uid = pwd.getpwnam(constants.SYSINV_USERNAME).pw_uid
gid = grp.getgrnam(constants.SYSINV_GRPNAME).gr_gid
os.makedirs(constants.SYSINV_LOCK_PATH)
os.chown(constants.SYSINV_LOCK_PATH, uid, gid)
LOG.info("Created directory=%s" %
constants.SYSINV_LOCK_PATH)
except OSError as e:
LOG.exception("makedir %s OSError=%s encountered" %
(constants.SYSINV_LOCK_PATH, e))
pass
cutils.check_lock_path()
system = self._create_default_system()