Merge "Asynchronously update node statistics"

Zuul 2018-11-29 21:00:14 +00:00 committed by Gerrit Code Review
commit 81936cd818
5 changed files with 133 additions and 22 deletions


@@ -94,7 +94,6 @@ class NodeLauncher(threading.Thread,
         try:
             dt = int((time.monotonic() - start_time) * 1000)
             self.recordLaunchStats(statsd_key, dt)
-            self.updateNodeStats(self.zk, self.provider_config)
         except Exception:
             self.log.exception("Exception while reporting stats:")
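With the synchronous updateNodeStats() call gone, the launch path only emits the launch timing metric. A minimal sketch of the timing pattern used above, where timed_ms is a hypothetical helper and not part of nodepool:

    import time

    def timed_ms(fn, *args, **kwargs):
        # Time fn in milliseconds on the monotonic clock, mirroring the
        # dt computation above; monotonic time never jumps backwards.
        start_time = time.monotonic()
        result = fn(*args, **kwargs)
        dt = int((time.monotonic() - start_time) * 1000)
        return result, dt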


@@ -46,13 +46,12 @@ LOCK_CLEANUP = 8 * HOURS
 SUSPEND_WAIT_TIME = 30
 
 
-class NodeDeleter(threading.Thread, stats.StatsReporter):
+class NodeDeleter(threading.Thread):
     log = logging.getLogger("nodepool.NodeDeleter")
 
     def __init__(self, zk, provider_manager, node):
         threading.Thread.__init__(self, name='NodeDeleter for %s %s' %
                                   (node.provider, node.external_id))
-        stats.StatsReporter.__init__(self)
         self._zk = zk
         self._provider_manager = provider_manager
         self._node = node
@@ -109,13 +108,8 @@ class NodeDeleter(threading.Thread, stats.StatsReporter):
         self.delete(self._zk, self._provider_manager, self._node, node_exists)
 
-        try:
-            self.updateNodeStats(self._zk, self._provider_manager.provider)
-        except Exception:
-            self.log.exception("Exception while reporting stats:")
-
 
-class PoolWorker(threading.Thread):
+class PoolWorker(threading.Thread, stats.StatsReporter):
     '''
     Class that manages node requests for a single provider pool.
@@ -143,6 +137,7 @@ class PoolWorker(threading.Thread):
         self.launcher_id = "%s-%s-%s" % (socket.gethostname(),
                                          os.getpid(),
                                          self.name)
+        stats.StatsReporter.__init__(self)
 
     # ---------------------------------------------------------------
     # Private methods
@@ -294,8 +289,12 @@ class PoolWorker(threading.Thread):
             launcher.id = self.launcher_id
             for prov_cfg in self.nodepool.config.providers.values():
                 launcher.supported_labels.update(prov_cfg.getSupportedLabels())
+            launcher.provider_name = self.provider_name
             self.zk.registerLauncher(launcher)
 
+            self.updateProviderLimits(
+                self.nodepool.config.providers.get(self.provider_name))
+
             try:
                 if not self.paused_handler:
                     self._assignHandlers()
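Recording provider_name at registration lets the elected stats reporter later derive the set of active providers and labels from ZooKeeper instead of from its local config (see the updateNodeStats rewrite below). A sketch of that aggregation, using hypothetical stand-in records shaped like the Launcher objects in this change:

    class FakeLauncher:
        # Hypothetical stand-in for zk_conn.getRegisteredLaunchers() results.
        def __init__(self, provider_name, supported_labels):
            self.provider_name = provider_name
            self.supported_labels = set(supported_labels)

    launchers = [FakeLauncher('cloud-a', ['small', 'large']),
                 FakeLauncher('cloud-b', ['small'])]

    providers = {launcher.provider_name for launcher in launchers}
    labels = set()
    for launcher in launchers:
        labels.update(launcher.supported_labels)
    # providers == {'cloud-a', 'cloud-b'}, labels == {'small', 'large'}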
@@ -699,6 +698,70 @@ class DeletedNodeWorker(BaseCleanupWorker):
             self.log.exception("Exception in DeletedNodeWorker:")
 
 
+class StatsWorker(BaseCleanupWorker, stats.StatsReporter):
+
+    def __init__(self, nodepool, interval):
+        super().__init__(nodepool, interval, name='StatsWorker')
+        self.log = logging.getLogger('nodepool.StatsWorker')
+        self.stats_event = threading.Event()
+        self.election = None
+
+    def stop(self):
+        self._running = False
+        if self.election is not None:
+            self.log.debug('Cancel leader election')
+            self.election.cancel()
+        self.stats_event.set()
+        super().stop()
+
+    def _run(self):
+        try:
+            stats.StatsReporter.__init__(self)
+
+            if not self._statsd:
+                return
+
+            if self.election is None:
+                zk = self._nodepool.getZK()
+                identifier = "%s-%s" % (socket.gethostname(), os.getpid())
+                self.election = zk.getStatsElection(identifier)
+
+            if not self._running:
+                return
+
+            self.election.run(self._run_stats)
+
+        except Exception:
+            self.log.exception('Exception in StatsWorker:')
+
+    def _run_stats(self):
+        self.log.info('Won stats reporter election')
+
+        # enable us getting events
+        zk = self._nodepool.getZK()
+        zk.setNodeStatsEvent(self.stats_event)
+
+        while self._running:
+            signaled = self.stats_event.wait()
+
+            if not self._running:
+                break
+
+            if not signaled:
+                continue
+
+            self.log.debug('Updating stats')
+            self.stats_event.clear()
+
+            try:
+                self.updateNodeStats(zk)
+            except Exception:
+                self.log.exception("Exception while reporting stats:")
+
+            time.sleep(1)
+
+        # Unregister from node stats events
+        zk.setNodeStatsEvent(None)
+
+
 class NodePool(threading.Thread):
     log = logging.getLogger("nodepool.NodePool")
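The _run_stats() loop above coalesces bursts of node events: wait() blocks until something changes, clear() resets the flag before reporting so changes arriving mid-report trigger another pass, and sleep(1) caps the reporting rate. A standalone sketch of the pattern, with all names hypothetical:

    import threading
    import time

    def coalescing_loop(event, keep_going, report):
        # keep_going is a callable returning False once the worker
        # should shut down; report() sends the current statistics.
        while keep_going():
            event.wait()          # block until at least one change
            if not keep_going():
                break
            event.clear()         # reset before reporting, not after
            report()
            time.sleep(1)         # rate-limit to ~1 report per second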
@@ -710,6 +773,7 @@ class NodePool(threading.Thread):
         self.watermark_sleep = watermark_sleep
         self.cleanup_interval = 60
         self.delete_interval = 5
+        self.stats_interval = 5
         self._stopped = False
         self._stop_event = threading.Event()
         self.config = None
@@ -718,6 +782,7 @@ class NodePool(threading.Thread):
         self._pool_threads = {}
         self._cleanup_thread = None
         self._delete_thread = None
+        self._stats_thread = None
         self._submittedRequests = {}
 
     def stop(self):
@@ -738,6 +803,10 @@ class NodePool(threading.Thread):
             self._delete_thread.stop()
             self._delete_thread.join()
 
+        if self._stats_thread:
+            self._stats_thread.stop()
+            self._stats_thread.join()
+
         # Don't let stop() return until all pool threads have been
         # terminated.
         self.log.debug("Stopping pool threads")
@@ -950,6 +1019,10 @@ class NodePool(threading.Thread):
                 self, self.delete_interval)
             self._delete_thread.start()
 
+        if not self._stats_thread:
+            self._stats_thread = StatsWorker(self, self.stats_interval)
+            self._stats_thread.start()
+
         # Stop any PoolWorker threads if the pool was removed
         # from the config.
         pool_keys = set()
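NodePool manages the StatsWorker exactly like the cleanup and delete workers: create lazily, stop and join on shutdown. BaseCleanupWorker itself is not part of this diff; a minimal stand-in consistent with how StatsWorker uses it (a _running flag, _run() invoked on an interval, stop() for shutdown) could look like:

    import threading
    import time

    class BaseCleanupWorkerSketch(threading.Thread):
        # Hypothetical stand-in; the real BaseCleanupWorker is defined
        # elsewhere in nodepool and may differ.
        def __init__(self, nodepool, interval, name):
            super().__init__(name=name)
            self._nodepool = nodepool
            self._interval = interval
            self._running = False

        def stop(self):
            self._running = False

        def _run(self):
            raise NotImplementedError  # subclasses do the actual work

        def run(self):
            self._running = True
            while self._running:
                self._run()
                time.sleep(self._interval)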


@@ -85,39 +85,40 @@ class StatsReporter(object):
             pipeline.incr(key)
         pipeline.send()
 
-    def updateNodeStats(self, zk_conn, provider):
+    def updateNodeStats(self, zk_conn):
         '''
         Refresh statistics for all known nodes.
 
         :param ZooKeeper zk_conn: A ZooKeeper connection object.
-        :param Provider provider: A config Provider object.
         '''
         if not self._statsd:
             return
 
         states = {}
 
+        launchers = zk_conn.getRegisteredLaunchers()
+        labels = set()
+        for launcher in launchers:
+            labels.update(launcher.supported_labels)
+        providers = set()
+        for launcher in launchers:
+            providers.add(launcher.provider_name)
+
         # Initialize things we know about to zero
         for state in zk.Node.VALID_STATES:
             key = 'nodepool.nodes.%s' % state
             states[key] = 0
-            key = 'nodepool.provider.%s.nodes.%s' % (provider.name, state)
-            states[key] = 0
+            for provider in providers:
+                key = 'nodepool.provider.%s.nodes.%s' % (provider, state)
+                states[key] = 0
 
         # Initialize label stats to 0
-        for label in provider.getSupportedLabels():
+        for label in labels:
             for state in zk.Node.VALID_STATES:
                 key = 'nodepool.label.%s.nodes.%s' % (label, state)
                 states[key] = 0
 
-        # Note that we intentionally don't use caching here because we don't
-        # know when the next update will happen and thus need to report the
-        # correct most recent state. Otherwise we can end up in reporting
-        # a gauge with a node in state deleting = 1 and never update this for
-        # a long time.
-        # TODO(tobiash): Changing updateNodeStats to just run periodically will
-        # resolve this and we can operate on cached data.
-        for node in zk_conn.nodeIterator(cached=False):
+        for node in zk_conn.nodeIterator():
             # nodepool.nodes.STATE
             key = 'nodepool.nodes.%s' % node.state
             states[key] += 1
@@ -145,9 +146,18 @@ class StatsReporter(object):
         for key, count in states.items():
             pipeline.gauge(key, count)
+        pipeline.send()
+
+    def updateProviderLimits(self, provider):
+        if not self._statsd:
+            return
+
+        pipeline = self._statsd.pipeline()
 
         # nodepool.provider.PROVIDER.max_servers
         key = 'nodepool.provider.%s.max_servers' % provider.name
         max_servers = sum([p.max_servers for p in provider.pools.values()
                            if p.max_servers])
         pipeline.gauge(key, max_servers)
         pipeline.send()
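updateProviderLimits() now owns the max_servers gauge, so it is refreshed whenever a pool worker re-registers rather than on every node update. The pipeline idiom batches metrics into as few network packets as possible; a sketch using the statsd Python package, with host, port, and values as examples:

    import statsd

    client = statsd.StatsClient('localhost', 8125)
    pipeline = client.pipeline()
    # Gauges are buffered locally until send() flushes them together.
    pipeline.gauge('nodepool.provider.cloud-a.max_servers', 42)
    pipeline.gauge('nodepool.nodes.ready', 7)
    pipeline.send()

Note the `if p.max_servers` filter above: pools without a max_servers limit contribute nothing to the summed gauge.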


@@ -207,6 +207,7 @@ class BaseTestCase(testtools.TestCase):
                      'fake-provider3',
                      'CleanupWorker',
                      'DeletedNodeWorker',
+                     'StatsWorker',
                      'pydevd.CommandThread',
                      'pydevd.Reader',
                      'pydevd.Writer',


@@ -23,6 +23,7 @@ from kazoo import exceptions as kze
 from kazoo.handlers.threading import KazooTimeoutError
 from kazoo.recipe.lock import Lock
 from kazoo.recipe.cache import TreeCache, TreeEvent
+from kazoo.recipe.election import Election
 
 from nodepool import exceptions as npe
@@ -164,6 +165,7 @@ class Launcher(Serializable):
     def __init__(self):
         self.id = None
+        self.provider_name = None
         self._supported_labels = set()
 
     def __eq__(self, other):
@@ -186,6 +188,7 @@ class Launcher(Serializable):
     def toDict(self):
         d = {}
         d['id'] = self.id
+        d['provider_name'] = self.provider_name
         # sets are not JSON serializable, so use a sorted list
         d['supported_labels'] = sorted(self.supported_labels)
         return d
@@ -194,6 +197,10 @@ class Launcher(Serializable):
     def fromDict(d):
         obj = Launcher()
         obj.id = d.get('id')
+        # TODO(tobiash): The fallback to 'unknown' is only needed to avoid
+        # having a full nodepool shutdown on upgrade. It can be
+        # removed later.
+        obj.provider_name = d.get('provider_name', 'unknown')
         obj.supported_labels = set(d.get('supported_labels', []))
         return obj
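The 'unknown' fallback keeps a rolling upgrade from failing: launcher records written before this change carry no provider_name key. A sketch of reading such an old record, with the JSON payload as example data:

    import json

    # A registration written by a pre-upgrade launcher: no provider_name.
    old_record = json.loads(
        '{"id": "host-1234-main", "supported_labels": ["small"]}')

    provider_name = old_record.get('provider_name', 'unknown')
    supported_labels = set(old_record.get('supported_labels', []))
    # provider_name == 'unknown' until the old launcher is upgraded.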
@@ -693,6 +700,7 @@ class ZooKeeper(object):
     NODE_ROOT = "/nodepool/nodes"
     REQUEST_ROOT = "/nodepool/requests"
     REQUEST_LOCK_ROOT = "/nodepool/requests-lock"
+    ELECTION_ROOT = "/nodepool/elections"
 
     # Log zookeeper retry every 10 seconds
     retry_log_rate = 10
@@ -710,10 +718,15 @@ class ZooKeeper(object):
         self._cached_node_requests = {}
         self.enable_cache = enable_cache
 
+        self.node_stats_event = None
+
     # =======================================================================
     # Private Methods
     # =======================================================================
 
+    def _electionPath(self, election):
+        return "%s/%s" % (self.ELECTION_ROOT, election)
+
     def _imagePath(self, image):
         return "%s/%s" % (self.IMAGE_ROOT, image)
@@ -2106,6 +2119,10 @@ class ZooKeeper(object):
                 node = Node.fromDict(d, node_id)
                 node.stat = event.event_data.stat
                 self._cached_nodes[node_id] = node
+
+            # set the stats event so the stats reporting thread can act upon it
+            if self.node_stats_event is not None:
+                self.node_stats_event.set()
         elif event.event_type == TreeEvent.NODE_REMOVED:
             try:
                 del self._cached_nodes[node_id]
@@ -2113,6 +2130,13 @@ class ZooKeeper(object):
                 # If it's already gone, don't care
                 pass
 
+            # set the stats event so the stats reporting thread can act upon it
+            if self.node_stats_event is not None:
+                self.node_stats_event.set()
+
+    def setNodeStatsEvent(self, event):
+        self.node_stats_event = event
+
     def requestCacheListener(self, event):
         if hasattr(event.event_data, 'path'):
@@ -2158,3 +2182,7 @@ class ZooKeeper(object):
             except KeyError:
                 # If it's already gone, don't care
                 pass
+
+    def getStatsElection(self, identifier):
+        path = self._electionPath('stats')
+        return Election(self.client, path, identifier)
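getStatsElection() wraps kazoo's Election recipe: every launcher contends on the same znode path, Election.run() blocks until this contender becomes leader and then invokes the given function, and cancel() aborts a pending contender (as StatsWorker.stop() does). A minimal usage sketch, with the ZooKeeper address and identifier as examples:

    from kazoo.client import KazooClient
    from kazoo.recipe.election import Election

    client = KazooClient(hosts='127.0.0.1:2181')
    client.start()

    def lead():
        # Runs only on the elected contender; leadership is released
        # when this function returns.
        print('won the stats election')

    election = Election(client, '/nodepool/elections/stats', 'host-1')
    election.run(lead)

This is what guarantees a single stats reporter across all launchers: the winner runs _run_stats() while every other launcher blocks in run() as a hot standby.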