zuul/zuul/cmd/executor.py
James E. Blair c0484c9d7c Sacrifice Ansible procs when OOM
When Linux runs out of memory and activates the OOM killer, it
scores processes based on how much memory they are using[1].  If
a job triggers an OOM by causing ansible-playbook to use a lot
of RAM, normally we would expect the OOM killer to kill Ansible.
However, if the executor is busy, it may be using a lot of RAM
as well, and its score may exceed the score of the smaller
Ansible process.  Nonetheless, we would still rather kill the
Ansible process.

This adjusts the score for the bubblewrap and ansible processes
so that they will have a score increased by an amount equal to
about 20% of system RAM.  This effectively means that as long
as the executor uses less than 20% of system RAM, it is guaranteed
to score lower than Ansible (and likely will continue to score
lower for some significant amount over that as well, depending
on how much RAM Ansible is using).

We read the executor's oom_score_adj when we initialize the bwrap
driver and add 200 to it in order to accommodate the situation where
the executor has its own oom_score_adj.  We always want the bwrap
children to have a higher score than the executor.

The choom program adjusts the OOM score for the command that it
executes, and this is inherited by child processes.  So we adjust
bwrap and expect ansible-playbook to inherit it.

It is also possible to adjust the score of the executor process
lower (so the executor could be made less likely to be a target)
but that requires root privileges, so is not implemented in this
change.

[1] https://lxr.linux.no/#linux+v6.7.1/mm/oom_kill.c#L201

Change-Id: I3a3d116cf68b84b8a6f9ec13808d1d2c2008008f
2024-06-03 09:12:57 -07:00

134 lines
4.5 KiB
Python
Executable File

# Copyright 2012 Hewlett-Packard Development Company, L.P.
# Copyright 2013-2014 OpenStack Foundation
# Copyright 2021-2022 Acme Gating, LLC
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import logging
import os
import sys
import signal
import zuul.cmd
import zuul.executor.server
from zuul.lib.config import get_default
class Executor(zuul.cmd.ZuulDaemonApp):
    """Command-line application that runs a standalone Zuul executor."""

    app_name = 'executor'
    app_description = 'A standalone Zuul executor.'

    def createParser(self):
        """Extend the base argument parser with executor options.

        Adds the ``--keep-jobdir`` flag and the executor sub-commands
        listed in ``zuul.executor.server.COMMANDS``.

        :returns: the parser object produced by the base class.
        """
        parser = super().createParser()
        parser.add_argument('--keep-jobdir', dest='keep_jobdir',
                            action='store_true',
                            help='keep local jobdirs after run completes')
        self.addSubCommands(parser, zuul.executor.server.COMMANDS)
        return parser

    def parseArguments(self, args=None):
        """Parse command-line arguments via the base class.

        NOTE(review): ``args`` is accepted but not forwarded to the base
        implementation, and nothing is returned -- confirm whether that
        is intentional before relying on either.
        """
        super().parseArguments()

    def exit_handler(self, signum, frame):
        """Signal handler that shuts the executor down.

        The ``executor.sigterm_method`` config option selects the
        behavior: ``graceful`` (the default when the option is absent)
        calls ``self.executor.graceful()`` and ``stop`` calls
        ``self.executor.stop()``.  Any other value is logged as an error
        and treated as ``graceful``.

        :param signum: signal number (unused).
        :param frame: current stack frame (unused).
        """
        if self.config.has_option('executor', 'sigterm_method'):
            graceful = self.config.get('executor', 'sigterm_method')
        else:
            graceful = 'graceful'

        if graceful.lower() == 'graceful':
            self.executor.graceful()
        elif graceful.lower() == 'stop':
            self.executor.stop()
        else:
            # Note the trailing space: the two literals are joined by
            # implicit concatenation.
            self.log.error("Unknown value for executor.sigterm_method: "
                           f"'{graceful}'. Expected 'graceful' or 'stop'")
            self.executor.graceful()

    def start_log_streamer(self):
        """Fork a child process that runs the log streamer.

        A pipe links the two processes: the parent keeps the write end
        (stored in ``self.log_streamer_pipe``) and the child blocks
        reading the other end.  Once the write end is closed -- either
        explicitly at the end of ``run`` or because the parent died --
        the read returns and the child stops the streamer and exits.

        The child's pid is stored in ``self.log_streamer_pid``.
        """
        pipe_read, pipe_write = os.pipe()
        child_pid = os.fork()
        if child_pid == 0:
            # Child process.  It must never return into the caller's
            # stack (which is a copy of the parent's), so every path
            # out of this branch ends in os._exit().
            exit_status = 0
            try:
                os.close(pipe_write)
                import zuul.lib.log_streamer

                self.log.info("Starting log streamer")
                streamer = zuul.lib.log_streamer.LogStreamer(
                    '::', self.finger_port, self.job_dir)

                # Keep running until the parent dies:
                pipe_read = os.fdopen(pipe_read)
                try:
                    pipe_read.read()
                except KeyboardInterrupt:
                    pass
                self.log.info("Stopping log streamer")
                streamer.stop()
            except Exception:
                self.log.exception("Log streamer failed")
                exit_status = 1
            finally:
                os._exit(exit_status)
        else:
            # Parent process: keep only the write end of the pipe.
            os.close(pipe_read)
            self.log_streamer_pipe = pipe_write
            self.log_streamer_pid = child_pid

    def run(self):
        """Configure and run the executor until it exits.

        Resolves the job directory and finger port from the config,
        starts the log streamer child, starts the executor server and
        then waits for it to finish.  Exits the process with status 1
        if a configured ``job_dir`` is not an existing directory.
        """
        self.handleCommands()

        self.setup_logging('executor', 'log_config')
        self.log = logging.getLogger("zuul.Executor")

        self.configure_connections(source_only=True, check_bwrap=True)

        if self.config.has_option('executor', 'job_dir'):
            # An explicitly configured job_dir must already exist.
            self.job_dir = os.path.expanduser(
                self.config.get('executor', 'job_dir'))
            if not os.path.isdir(self.job_dir):
                print("Invalid job_dir: {job_dir}".format(
                    job_dir=self.job_dir))
                sys.exit(1)
        else:
            # The default location is created on demand.
            self.job_dir = '/var/lib/zuul/builds'
            if not os.path.exists(self.job_dir):
                os.mkdir(self.job_dir)

        self.finger_port = int(
            get_default(self.config, 'executor', 'finger_port',
                        zuul.executor.server.DEFAULT_FINGER_PORT)
        )

        # Fork the streamer before the executor server is created and
        # started.
        self.start_log_streamer()

        ExecutorServer = zuul.executor.server.ExecutorServer
        self.executor = ExecutorServer(self.config,
                                       self.connections,
                                       jobdir_root=self.job_dir,
                                       keep_jobdir=self.args.keep_jobdir,
                                       log_streaming_port=self.finger_port)
        self.executor.start()

        if self.args.nodaemon:
            signal.signal(signal.SIGTERM, self.exit_handler)

        # A Ctrl+C interrupts join(); ask the executor to shut down and
        # resume waiting until it has actually finished.
        while True:
            try:
                self.executor.join()
                break
            except KeyboardInterrupt:
                print("Ctrl + C: asking executor to exit nicely...\n")
                self.exit_handler(signal.SIGINT, None)

        # Closing the write end of the pipe tells the log streamer child
        # to stop; then reap it.
        os.close(self.log_streamer_pipe)
        os.waitpid(self.log_streamer_pid, 0)
def main():
    """Entry point: build the Executor application and run its main()."""
    app = Executor()
    app.main()