Log cpu times of ansible executions

We need to be able to compare and discover ansible performance
regressions or improvements of ansible. Currently we have no way of
detecting changes there other than observing the overall system load
of executors. One way to get some metrics is to log the cpu times used
by individual ansible runs and the sum of them over the whole job
execution. With this one could grab that data from the log and analyse
them.

Change-Id: Ib0b62299c741533f0d1615f67eced9601498f00d
This commit is contained in:
Tobias Henkel 2018-07-14 10:23:57 +02:00
parent 11709e7044
commit 5a4db84e5a
No known key found for this signature in database
GPG Key ID: 03750DEC158E5FA2
3 changed files with 25 additions and 3 deletions

View File

@ -19,6 +19,7 @@ import fcntl
import grp import grp
import logging import logging
import os import os
import psutil
import pwd import pwd
import shlex import shlex
import subprocess import subprocess
@ -52,7 +53,7 @@ class WrappedPopen(object):
if fd not in pass_fds: if fd not in pass_fds:
pass_fds.append(fd) pass_fds.append(fd)
kwargs['pass_fds'] = pass_fds kwargs['pass_fds'] = pass_fds
proc = subprocess.Popen(args, *sub_args, **kwargs) proc = psutil.Popen(args, *sub_args, **kwargs)
finally: finally:
self.__del__() self.__del__()
return proc return proc

View File

@ -15,7 +15,7 @@
# under the License. # under the License.
import logging import logging
import subprocess import psutil
from zuul.driver import (Driver, WrapperInterface) from zuul.driver import (Driver, WrapperInterface)
from zuul.execution_context import BaseExecutionContext from zuul.execution_context import BaseExecutionContext
@ -25,7 +25,7 @@ class NullExecutionContext(BaseExecutionContext):
log = logging.getLogger("zuul.NullExecutionContext") log = logging.getLogger("zuul.NullExecutionContext")
def getPopen(self, **kwargs): def getPopen(self, **kwargs):
return subprocess.Popen return psutil.Popen
class NullwrapDriver(Driver, WrapperInterface): class NullwrapDriver(Driver, WrapperInterface):

View File

@ -636,6 +636,9 @@ class AnsibleJob(object):
self.executor_variables_file = None self.executor_variables_file = None
self.cpu_times = {'user': 0, 'system': 0,
'children_user': 0, 'children_system': 0}
if self.executor_server.config.has_option('executor', 'variables'): if self.executor_server.config.has_option('executor', 'variables'):
self.executor_variables_file = self.executor_server.config.get( self.executor_variables_file = self.executor_server.config.get(
'executor', 'variables') 'executor', 'variables')
@ -952,6 +955,13 @@ class AnsibleJob(object):
pre_failed = True pre_failed = True
break break
self.log.debug(
"Overall ansible cpu times: user=%.2f, system=%.2f, "
"children_user=%.2f, children_system=%.2f" %
(self.cpu_times['user'], self.cpu_times['system'],
self.cpu_times['children_user'],
self.cpu_times['children_system']))
if not pre_failed: if not pre_failed:
ansible_timeout = self.getAnsibleTimeout(time_started, job_timeout) ansible_timeout = self.getAnsibleTimeout(time_started, job_timeout)
job_status, job_code = self.runAnsiblePlaybook( job_status, job_code = self.runAnsiblePlaybook(
@ -1575,6 +1585,17 @@ class AnsibleJob(object):
line = line[:1024].rstrip() line = line[:1024].rstrip()
self.log.debug("Ansible output: %s" % (line,)) self.log.debug("Ansible output: %s" % (line,))
self.log.debug("Ansible output terminated") self.log.debug("Ansible output terminated")
cpu_times = self.proc.cpu_times()
self.log.debug("Ansible cpu times: user=%.2f, system=%.2f, "
"children_user=%.2f, "
"children_system=%.2f" %
(cpu_times.user, cpu_times.system,
cpu_times.children_user,
cpu_times.children_system))
self.cpu_times['user'] += cpu_times.user
self.cpu_times['system'] += cpu_times.system
self.cpu_times['children_user'] += cpu_times.children_user
self.cpu_times['children_system'] += cpu_times.children_system
ret = self.proc.wait() ret = self.proc.wait()
self.log.debug("Ansible exit code: %s" % (ret,)) self.log.debug("Ansible exit code: %s" % (ret,))
finally: finally: