Monasca agent on ESX hangs on collector autorestart

This improves the likelihood that the collector daemon will exit during
auto restart. Also removed the gevent requirement and usage so that only
eventlet is used. The conflict between gevent and eventlet caused the
multiprocessing worker join and sys.exit to hang forever.

Change-Id: I60f980f4a74eafb709e51c0f520a81e2d72cb609
Author: James Gu, 2017-01-26 23:18:00 -08:00 (committed by James Gu)
Commit: 30d9176e50 (parent: 6f7476874b)
5 changed files with 152 additions and 87 deletions
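The fix matters because the agent supervisor can only auto-restart the collector when the
child process actually exits with the reserved restart status; a hung worker join blocks
that exit. A minimal sketch of such a supervise-by-exit-code loop, with an illustrative
status value and child command rather than the agent's real ones:

    import subprocess
    import sys

    RESTART_EXIT_STATUS = 99  # illustrative value, not the agent's actual constant


    def supervise(cmd):
        """Re-run the child for as long as it requests a restart via its exit code."""
        while True:
            status = subprocess.call(cmd)
            if status != RESTART_EXIT_STATUS:
                # Any other exit (or a crash) ends supervision. If the child hangs
                # instead of exiting, this loop never gets a chance to restart it.
                return status


    if __name__ == '__main__':
        sys.exit(supervise([sys.executable, '-c', 'import sys; sys.exit(0)']))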

File 1 of 5: the Collector class (collector checks core)

@@ -1,10 +1,11 @@
-# (C) Copyright 2015,2016 Hewlett Packard Enterprise Development LP
+# (C) Copyright 2015-2017 Hewlett Packard Enterprise Development LP
 
 # Core modules
 import logging
 from multiprocessing.dummy import Pool
 import os
 import socket
+import sys
 import threading
 import time
@@ -264,7 +265,7 @@ class Collector(util.Dimensions):
 
         return measurements
 
-    def stop(self):
+    def stop(self, timeout=0):
         """Tell the collector to stop at the next logical point.
         """
         # This is called when the process is being killed, so
@@ -273,10 +274,39 @@ class Collector(util.Dimensions):
         # because the forwarder is quite possibly already killed
         # in which case we'll get a misleading error in the logs.
         # Best to not even try.
+        log.info("stopping the collector with timeout %d seconds" % timeout)
         self.continue_running = False
         for check_name in self.collection_times:
             check = self.collection_times[check_name]['check']
             check.stop()
+        for check_name in self.collection_results:
+            run_time = time.time() - self.collection_results[check_name]['start_time']
+            log.info('When exiting... Plugin %s still running after %d seconds' % (
+                check_name, run_time))
         self.pool.close()
-        # Can't call self.pool.join() because this is an event thread
-        # and that will cause a BlockingSwitchOutError
+        # Won't call join() if timeout is zero. If we are in an event thread
+        # a BlockingSwitchOutError occurs if wait
+        if (timeout > 0):
+            timer = util.Timer()
+            for worker in self.pool._pool:
+                t = timeout - timer.total()
+                if t <= 0:
+                    break
+                if worker.is_alive():
+                    try:
+                        worker.join(t)
+                    except Exception:
+                        log.error("Unexpected error: ", sys.exc_info()[0])
+            for worker in self.pool._pool:
+                if worker.is_alive():
+                    # the worker didn't complete in the specified timeout.
+                    # collector must honor the stop request to avoid agent stop/restart hang.
+                    # os._exit() should be called after collector stops.
+                    log.info('worker %s is still alive when collector stop times out.' % worker.name)
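The stop(timeout) change above joins each worker thread only for the time remaining in an
overall budget, so a stuck check cannot block shutdown indefinitely. A minimal standalone
sketch of that bounded-join pattern; the Pool is thread-backed like the collector's, but the
bounded_shutdown helper and the slow task are illustrative, not the agent's code:

    import time
    from multiprocessing.dummy import Pool  # thread-backed pool, as in the collector


    def bounded_shutdown(pool, timeout):
        """Close the pool and wait at most `timeout` seconds for all workers."""
        pool.close()
        deadline = time.time() + timeout
        for worker in pool._pool:           # same private worker list the change walks
            remaining = deadline - time.time()
            if remaining <= 0:
                break
            if worker.is_alive():
                worker.join(remaining)      # bounded join, never an indefinite wait
        return [w.name for w in pool._pool if w.is_alive()]


    if __name__ == '__main__':
        pool = Pool(2)
        pool.apply_async(time.sleep, (5,))  # illustrative slow task, outlives the budget
        leftover = bounded_shutdown(pool, timeout=1)
        print('workers still alive after timeout: %s' % leftover)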

File 2 of 5: the ServicesCheck class (services_checks)

@@ -1,16 +1,16 @@
-# (C) Copyright 2015,2016 Hewlett Packard Enterprise Development Company LP
+# (C) Copyright 2015-2017 Hewlett Packard Enterprise Development Company LP
 
 import collections
+from concurrent import futures
 import Queue
 import threading
 
-from gevent import monkey
-from gevent import Timeout
-from multiprocessing.dummy import Pool as ThreadPool
+import eventlet
+from eventlet.green import time
+import multiprocessing
 
 import monasca_agent.collector.checks
 
 DEFAULT_TIMEOUT = 180
 DEFAULT_SIZE_POOL = 6
 MAX_LOOP_ITERATIONS = 1000
@@ -20,7 +20,6 @@ FAILURE = "FAILURE"
 
 up_down = collections.namedtuple('up_down', ['UP', 'DOWN'])
 Status = up_down('UP', 'DOWN')
 EventType = up_down("servicecheck.state_change.up", "servicecheck.state_change.down")
-monkey.patch_all()
 
 
 class ServicesCheck(monasca_agent.collector.checks.AgentCheck):
@@ -34,7 +33,7 @@ class ServicesCheck(monasca_agent.collector.checks.AgentCheck):
     The main agent loop will call the check function for each instance for
     each iteration of the loop.
 
     The check method will make an asynchronous call to the _process method in
-    one of the thread initiated in the thread pool created in this class constructor.
+    one of the thread pool executors created in this class constructor.
     The _process method will call the _check method of the inherited class
     which will perform the actual check.
@@ -57,23 +56,26 @@ class ServicesCheck(monasca_agent.collector.checks.AgentCheck):
         # The pool size should be the minimum between the number of instances
         # and the DEFAULT_SIZE_POOL. It can also be overridden by the 'threads_count'
         # parameter in the init_config of the check
-        default_size = min(self.instance_count(), DEFAULT_SIZE_POOL)
+        try:
+            default_size = min(self.instance_count(), multiprocessing.cpu_count() + 1)
+        except NotImplementedError:
+            default_size = min(self.instance_count(), DEFAULT_SIZE_POOL)
         self.pool_size = int(self.init_config.get('threads_count', default_size))
         self.timeout = int(self.agent_config.get('timeout', DEFAULT_TIMEOUT))
 
     def start_pool(self):
-        self.log.info("Starting Thread Pool")
-        self.pool = ThreadPool(self.pool_size)
-        if threading.activeCount() > MAX_ALLOWED_THREADS:
-            self.log.error("Thread count ({0}) exceeds maximum ({1})".format(threading.activeCount(),
-                                                                             MAX_ALLOWED_THREADS))
-        self.running_jobs = set()
+        if self.pool is None:
+            self.log.info("Starting Thread Pool Executor")
+            self.pool = futures.ThreadPoolExecutor(max_workers=self.pool_size)
+            if threading.activeCount() > MAX_ALLOWED_THREADS:
+                self.log.error('Thread count (%d) exceeds maximum (%d)' % (threading.activeCount(),
+                                                                           MAX_ALLOWED_THREADS))
+            self.running_jobs = {}
 
     def stop_pool(self):
         self.log.info("Stopping Thread Pool")
         if self.pool:
-            self.pool.close()
-            self.pool.join()
+            self.pool.shutdown(wait=True)
             self.pool = None
 
     def restart_pool(self):
@@ -81,80 +83,63 @@ class ServicesCheck(monasca_agent.collector.checks.AgentCheck):
         self.start_pool()
 
     def check(self, instance):
-        if not self.pool:
-            self.start_pool()
-
-        self._process_results()
+        self.start_pool()
+
         name = instance.get('name', None)
         if name is None:
             self.log.error('Each service check must have a name')
             return
 
-        if name not in self.running_jobs:
+        if (name not in self.running_jobs) or self.running_jobs[name].done():
             # A given instance should be processed one at a time
-            self.running_jobs.add(name)
-            self.pool.apply_async(self._process, args=(instance,))
+            self.running_jobs[name] = self.pool.submit(self._process, instance)
         else:
             self.log.info("Instance: %s skipped because it's already running." % name)
 
     def _process(self, instance):
         name = instance.get('name', None)
 
         try:
-            with Timeout(self.timeout):
+            with eventlet.timeout.Timeout(self.timeout):
                 return_value = self._check(instance)
                 if not return_value:
                     return
 
                 status, msg = return_value
-                result = (status, msg, name, instance)
-                # We put the results in the result queue
-                self.resultsq.put(result)
-        except Timeout:
-            self.log.error('ServiceCheck {0} timed out'.format(name))
+                self._process_result(status, msg, name, instance)
+        except eventlet.Timeout:
+            msg = 'ServiceCheck {0} timed out'.format(name)
+            self.log.error(msg)
+            self._process_result(FAILURE, msg, name, instance)
         except Exception:
-            self.log.exception('Failure in ServiceCheck {0}'.format(name))
-            result = (FAILURE, FAILURE, FAILURE, FAILURE)
-            self.resultsq.put(result)
+            msg = 'Failure in ServiceCheck {0}'.format(name)
+            self.log.exception(msg)
+            self._process_result(FAILURE, msg, name, instance)
         finally:
-            self.running_jobs.remove(name)
+            del self.running_jobs[name]
 
-    def _process_results(self):
-        for i in range(MAX_LOOP_ITERATIONS):
-            try:
-                # We want to fetch the result in a non blocking way
-                status, msg, name, queue_instance = self.resultsq.get_nowait()
-            except Queue.Empty:
-                break
-
-            if status == FAILURE:
-                self.nb_failures += 1
-                if self.nb_failures >= self.pool_size - 1:
-                    self.nb_failures = 0
-                    self.restart_pool()
-                continue
-
-            if name not in self.statuses:
-                self.statuses[name] = []
-
-            self.statuses[name].append(status)
-
-            window = int(queue_instance.get('window', 1))
-
-            if window > 256:
-                self.log.warning("Maximum window size (256) exceeded, defaulting it to 256")
-                window = 256
-
-            threshold = queue_instance.get('threshold', 1)
-
-            if len(self.statuses[name]) > window:
-                self.statuses[name].pop(0)
-
-            nb_failures = self.statuses[name].count(Status.DOWN)
-
-            if nb_failures >= threshold:
-                if self.notified.get(name, Status.UP) != Status.DOWN:
-                    self.notified[name] = Status.DOWN
-            else:
-                if self.notified.get(name, Status.UP) != Status.UP:
-                    self.notified[name] = Status.UP
+    def _process_result(self, status, msg, name, queue_instance):
+        if name not in self.statuses:
+            self.statuses[name] = []
+
+        self.statuses[name].append(status)
+
+        window = int(queue_instance.get('window', 1))
+        if window > 256:
+            self.log.warning("Maximum window size (256) exceeded, defaulting it to 256")
+            window = 256
+
+        threshold = queue_instance.get('threshold', 1)
+        if len(self.statuses[name]) > window:
+            self.statuses[name].pop(0)
+
+        nb_failures = self.statuses[name].count(Status.DOWN)
+        if nb_failures >= threshold:
+            if self.notified.get(name, Status.UP) != Status.DOWN:
+                self.notified[name] = Status.DOWN
+        else:
+            if self.notified.get(name, Status.UP) != Status.UP:
+                self.notified[name] = Status.UP
 
     def _check(self, instance):
         """This function should be implemented by inherited classes.

File 3 of 5: the CollectorDaemon class (collector daemon)

@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# (C) Copyright 2015-2016 Hewlett Packard Enterprise Development Company LP
+# (C) Copyright 2015-2017 Hewlett Packard Enterprise Development LP
 
 # Core modules
 import glob
@@ -53,20 +53,26 @@ class CollectorDaemon(monasca_agent.common.daemon.Daemon):
         self.start_event = start_event
 
     def _handle_sigterm(self, signum, frame):
-        log.debug("Caught sigterm. Stopping run loop.")
+        log.debug("Caught sigterm.")
+        self._stop(0)
+        sys.exit(0)
+
+    def _handle_sigusr1(self, signum, frame):
+        log.debug("Caught sigusr1.")
+        self._stop(120)
+        sys.exit(monasca_agent.common.daemon.AgentSupervisor.RESTART_EXIT_STATUS)
+
+    def _stop(self, timeout=0):
+        log.info("Stopping collector run loop.")
         self.run_forever = False
         if jmxfetch.JMXFetch.is_running():
             jmxfetch.JMXFetch.stop()
         if self.collector:
-            self.collector.stop()
-        log.debug("Collector is stopped.")
-        sys.exit(0)
-
-    def _handle_sigusr1(self, signum, frame):
-        self._handle_sigterm(signum, frame)
-        self._do_restart()
+            self.collector.stop(timeout)
+        log.info('collector stopped')
 
     def run(self, config):
         """Main loop of the collector.
@@ -94,6 +100,9 @@ class CollectorDaemon(monasca_agent.common.daemon.Daemon):
         self.restart_interval = int(util.get_collector_restart_interval())
         self.agent_start = time.time()
 
+        exitCode = 0
+        exitTimeout = 0
+
         # Run the main loop.
         while self.run_forever:
             collection_start = time.time()
@@ -125,7 +134,10 @@ class CollectorDaemon(monasca_agent.common.daemon.Daemon):
 
             # Check if we should restart.
             if self.autorestart and self._should_restart():
-                self._do_restart()
+                self.run_forever = False
+                exitCode = monasca_agent.common.daemon.AgentSupervisor.RESTART_EXIT_STATUS
+                exitTimeout = 120
+                log.info('Starting an auto restart')
 
             # Only plan for the next loop if we will continue,
             # otherwise just exit quickly.
@@ -137,23 +149,18 @@ class CollectorDaemon(monasca_agent.common.daemon.Daemon):
                 log.info("Collection took {0} which is as long or longer then the configured collection frequency "
                          "of {1}. Starting collection again without waiting in result.".format(collection_time,
                                                                                                 check_frequency))
 
+        self._stop(exitTimeout)
         # Explicitly kill the process, because it might be running
         # as a daemon.
-        log.info("Exiting. Bye bye.")
-        sys.exit(0)
+        log.info("Exiting collector daemon, code %d." % exitCode)
+        os._exit(exitCode)
 
     def _should_restart(self):
         if time.time() - self.agent_start > self.restart_interval:
             return True
         return False
 
-    def _do_restart(self):
-        log.info("Running an auto-restart.")
-        if self.collector:
-            self.collector.stop()
-        sys.exit(monasca_agent.common.daemon.AgentSupervisor.RESTART_EXIT_STATUS)
-
 
 def main():
     options, args = util.get_parsed_args()
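With the handlers above, SIGTERM stops immediately and exits 0, while SIGUSR1 gives running
checks up to 120 seconds and then exits with the supervisor's restart status, and the run
loop itself finishes through os._exit so lingering worker threads cannot keep the daemon
alive. A compact sketch of that split; the _stop placeholder and the status value are
illustrative, not the agent's real implementation:

    import os
    import signal
    import sys
    import time

    RESTART_EXIT_STATUS = 99  # illustrative; the agent reads this from AgentSupervisor


    def _stop(timeout):
        # Placeholder for "stop the run loop and give checks `timeout` seconds".
        print('stopping, waiting up to %d seconds for running checks' % timeout)


    def handle_sigterm(signum, frame):
        _stop(0)                       # plain shutdown: no grace period
        sys.exit(0)


    def handle_sigusr1(signum, frame):
        _stop(120)                     # auto restart: let long checks wind down first
        sys.exit(RESTART_EXIT_STATUS)  # tells the supervisor to start a new collector


    if __name__ == '__main__':
        signal.signal(signal.SIGTERM, handle_sigterm)
        signal.signal(signal.SIGUSR1, handle_sigusr1)
        time.sleep(60)                 # stand-in for the collection loop
        os._exit(0)                    # hard exit: do not wait on worker threads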

File 4 of 5: requirements

@@ -9,7 +9,6 @@ oslo.i18n>=2.1.0 # Apache-2.0
 oslo.utils>=3.18.0 # Apache-2.0
 oslo.vmware>=2.11.0 # Apache-2.0
 PyYAML>=3.10.0 # MIT
-gevent>=1.1.1
 httplib2>=0.7.5 # MIT
 netaddr>=0.7.13,!=0.7.16 # BSD
 ntplib>=0.3.2,<0.4
@@ -21,4 +20,6 @@ redis>=2.10.0 # MIT
 six>=1.9.0 # MIT
 supervisor>=3.1.3,<3.4
 stevedore>=1.17.1 # Apache-2.0
 tornado>=4.3
+futures>=2.1.3
+eventlet!=0.18.3,>=0.18.2 # MIT

File 5 of 5: new unit test for ServicesCheck timeouts

@@ -0,0 +1,42 @@
+# (C) Copyright 2017 Hewlett Packard Enterprise Development LP
+
+from unittest import TestCase
+
+from eventlet.green import time
+
+from monasca_agent.collector.checks.services_checks import ServicesCheck
+from monasca_agent.collector.checks.services_checks import Status
+
+
+class DummyServiceCheck(ServicesCheck):
+    def __init__(self, name, init_config, agent_config, instances=None):
+        super(DummyServiceCheck, self).__init__(name, init_config, agent_config, instances)
+
+    def _check(self, instance):
+        w = instance.get('service_wait', 5)
+        time.sleep(w)
+        return Status.UP, "UP"
+
+    def set_timeout(self, timeout):
+        self.timeout = timeout
+
+
+class TestServicesCheck(TestCase):
+    def setUp(self):
+        TestCase.setUp(self)
+        pass
+
+    def test_service_check_timeout(self):
+        init_config = {}
+        agent_config = {'timeout': 4}
+        instances = []
+        for wait in [1, 2, 6, 8]:
+            instances.append({'service_wait': wait, 'name': 'dummy %d' % wait})
+        self.dummy_service = DummyServiceCheck("dummy service", init_config, agent_config, instances=instances)
+        self.dummy_service.run()
+        time.sleep(10)
+        self.assertEqual(self.dummy_service.statuses['dummy 1'][0], Status.UP)
+        self.assertEqual(self.dummy_service.statuses['dummy 2'][0], Status.UP)
+        self.assertEqual(self.dummy_service.statuses['dummy 6'][0], "FAILURE")
+        self.assertEqual(self.dummy_service.statuses['dummy 8'][0], "FAILURE")