c0484c9d7c
When Linux runs out of memory and activates the OOM killer, it scores processes based on how much memory they are using[1]. If a job triggers an OOM by causing ansible-playbook to use a lot of RAM, normally we would expect the OOM killer to kill Ansible. However, if the executor is busy, it may be using a lot of RAM as well, and its score may exceed the score of the smaller Ansible process. Nonetheless, we would still rather kill the Ansible process. This adjusts the score for the bubblewrap and ansible processes so that they will have a score increased by an amount equal to about 20% of system RAM. This effectively means that as long as the executor uses less than 20% of system RAM, it is guaranteed to score lower than Ansible (and likely will continue to score lower for some significant amount over that as well, depending on how much RAM Ansible is using). We read the executor's oom_score_adj when we initialize the bwrap driver and add 200 to it in order to accommodate the situation where the executor has its own oom_score_adj. We always want the bwrap children to have a higher score than the executor. The choom program adjusts the OOM score for the command that it executes, and this is inherited by child processes. So we adjust bwrap and expect ansible-playbook to inherit it. It is also possible to adjust the score of the executor process lower (so the executor could be made less likely to be a target) but that requires root privileges, so is not implemented in this change. [1] https://lxr.linux.no/#linux+v6.7.1/mm/oom_kill.c#L201 Change-Id: I3a3d116cf68b84b8a6f9ec13808d1d2c2008008f
134 lines
4.5 KiB
Python
Executable File
134 lines
4.5 KiB
Python
Executable File
# Copyright 2012 Hewlett-Packard Development Company, L.P.
|
|
# Copyright 2013-2014 OpenStack Foundation
|
|
# Copyright 2021-2022 Acme Gating, LLC
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
|
# not use this file except in compliance with the License. You may obtain
|
|
# a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
# License for the specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
import logging
|
|
import os
|
|
import sys
|
|
import signal
|
|
|
|
import zuul.cmd
|
|
import zuul.executor.server
|
|
|
|
from zuul.lib.config import get_default
|
|
|
|
|
|
class Executor(zuul.cmd.ZuulDaemonApp):
    """Command-line application that runs a standalone Zuul executor.

    ``run()`` forks a helper process hosting the log streamer, starts an
    ``ExecutorServer`` in the current process, waits for it to finish and
    then reaps the helper.
    """

    app_name = 'executor'
    app_description = 'A standalone Zuul executor.'

    def createParser(self):
        """Extend the base parser with executor-specific options.

        Adds the ``--keep-jobdir`` flag and registers the sub-commands
        listed in ``zuul.executor.server.COMMANDS``.

        :returns: the parser produced by the base class, extended in place.
        """
        parser = super().createParser()
        parser.add_argument('--keep-jobdir', dest='keep_jobdir',
                            action='store_true',
                            help='keep local jobdirs after run completes')
        self.addSubCommands(parser, zuul.executor.server.COMMANDS)
        return parser

    def parseArguments(self, args=None):
        """Parse command-line arguments via the base class.

        :param args: optional explicit argument list; ``None`` leaves the
            choice of arguments to the base implementation.
        """
        # Forward ``args`` instead of silently discarding it, so callers
        # that pass an explicit argument list actually get it parsed.
        # NOTE(review): assumes the base parseArguments accepts the same
        # optional ``args`` parameter as this override -- confirm.
        super().parseArguments(args)

    def exit_handler(self, signum, frame):
        """Ask the executor to shut down.

        The shutdown style is read from the ``sigterm_method`` option of
        the ``executor`` config section (compared case-insensitively):
        ``graceful`` calls ``self.executor.graceful()`` and ``stop`` calls
        ``self.executor.stop()``.  A missing option defaults to
        ``graceful``; an unrecognised value is logged as an error and
        handled as ``graceful``.

        :param signum: signal number (unused; part of the handler
            signature).
        :param frame: interrupted stack frame (unused).
        """
        if self.config.has_option('executor', 'sigterm_method'):
            configured = self.config.get('executor', 'sigterm_method')
        else:
            configured = 'graceful'

        method = configured.lower()
        if method == 'stop':
            self.executor.stop()
            return
        if method != 'graceful':
            # Report the value as written in the config, not lower-cased.
            self.log.error("Unknown value for executor.sigterm_method: "
                           f"'{configured}'. Expected 'graceful' or 'stop'")
        self.executor.graceful()

    def start_log_streamer(self):
        """Fork a child process that runs the log streamer.

        A pipe ties the child's lifetime to the parent: the child blocks
        reading the pipe and stops the streamer once the read returns,
        i.e. when the parent closes the write end or exits.

        In the parent this records the write end of the pipe in
        ``self.log_streamer_pipe`` and the child's pid in
        ``self.log_streamer_pid``.  The child never returns from this
        method.
        """
        pipe_read, pipe_write = os.pipe()
        child_pid = os.fork()

        if child_pid != 0:
            # Parent: keep only the write end; closing it later is what
            # tells the child to shut down.
            os.close(pipe_read)
            self.log_streamer_pipe = pipe_write
            self.log_streamer_pid = child_pid
            return

        # Child: whatever happens below, leave via os._exit() so that this
        # process can never fall back into the parent's code path (run())
        # and start a second executor or run the parent's cleanup.
        exit_status = 1
        try:
            os.close(pipe_write)
            import zuul.lib.log_streamer

            self.log.info("Starting log streamer")
            streamer = zuul.lib.log_streamer.LogStreamer(
                '::', self.finger_port, self.job_dir)

            # Keep running until the parent dies:
            with os.fdopen(pipe_read) as parent_pipe:
                try:
                    parent_pipe.read()
                except KeyboardInterrupt:
                    pass
            self.log.info("Stopping log streamer")
            streamer.stop()
            exit_status = 0
        except Exception:
            self.log.exception("Log streamer failed")
        finally:
            os._exit(exit_status)

    def run(self):
        """Run the executor until it exits.

        Steps, in order: handle sub-commands, set up logging and
        connections, resolve the job directory and finger port, fork the
        log streamer, start the ``ExecutorServer``, wait for it to finish
        and finally shut down and reap the log streamer child.

        Exits the process with status 1 if a configured ``job_dir`` is not
        an existing directory.
        """
        self.handleCommands()

        self.setup_logging('executor', 'log_config')
        self.log = logging.getLogger("zuul.Executor")

        self.configure_connections(source_only=True, check_bwrap=True)

        if self.config.has_option('executor', 'job_dir'):
            # An explicitly configured directory must already exist.
            self.job_dir = os.path.expanduser(
                self.config.get('executor', 'job_dir'))
            if not os.path.isdir(self.job_dir):
                print("Invalid job_dir: {job_dir}".format(
                    job_dir=self.job_dir))
                sys.exit(1)
        else:
            # The default location is created on demand.  makedirs with
            # exist_ok avoids the check-then-create race and also copes
            # with a missing parent directory.
            self.job_dir = '/var/lib/zuul/builds'
            os.makedirs(self.job_dir, exist_ok=True)

        self.finger_port = int(
            get_default(self.config, 'executor', 'finger_port',
                        zuul.executor.server.DEFAULT_FINGER_PORT)
        )

        # Fork the streamer before the executor server is created and
        # started in this process.
        self.start_log_streamer()

        ExecutorServer = zuul.executor.server.ExecutorServer
        self.executor = ExecutorServer(self.config,
                                       self.connections,
                                       jobdir_root=self.job_dir,
                                       keep_jobdir=self.args.keep_jobdir,
                                       log_streaming_port=self.finger_port)
        self.executor.start()

        # The SIGTERM handler is only installed here when running in the
        # foreground (nodaemon).
        if self.args.nodaemon:
            signal.signal(signal.SIGTERM, self.exit_handler)

        while True:
            try:
                self.executor.join()
                break
            except KeyboardInterrupt:
                # Request shutdown, then go back to waiting for join().
                print("Ctrl + C: asking executor to exit nicely...\n")
                self.exit_handler(signal.SIGINT, None)

        # Closing the write end makes the child's blocking read return,
        # which lets the streamer stop; then reap the child.
        os.close(self.log_streamer_pipe)
        os.waitpid(self.log_streamer_pid, 0)
|
|
|
|
|
|
def main():
    """Console entry point: build the executor application and run it."""
    app = Executor()
    app.main()
|