The Gatekeeper, or a project gating system
# Copyright 2014 OpenStack Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

import base64
import collections
import datetime
import json
import logging
import multiprocessing
import os
import psutil
import shutil
import signal
import shlex
import socket
import subprocess
import tempfile
import threading
import time
import traceback
from concurrent.futures.process import ProcessPoolExecutor, BrokenProcessPool
import re

import git
from urllib.parse import urlsplit

from zuul.lib.ansible import AnsibleManager
from zuul.lib.gearworker import ZuulGearWorker
from zuul.lib.yamlutil import yaml
from zuul.lib.config import get_default
from zuul.lib.logutil import get_annotated_logger
from zuul.lib.statsd import get_statsd
from zuul.lib import filecomments
import gear

import zuul.lib.repl
import zuul.merger.merger
import zuul.ansible.logconfig
from zuul.executor.sensors.cpu import CPUSensor
from zuul.executor.sensors.hdd import HDDSensor
from zuul.executor.sensors.pause import PauseSensor
from zuul.executor.sensors.startingbuilds import StartingBuildsSensor
from zuul.executor.sensors.ram import RAMSensor
from zuul.lib import commandsocket
from zuul.merger.server import BaseMergeServer, RepoLocks

BUFFER_LINES_FOR_SYNTAX = 200
COMMANDS = ['stop', 'pause', 'unpause', 'graceful', 'verbose',
            'unverbose', 'keep', 'nokeep', 'repl', 'norepl']
DEFAULT_FINGER_PORT = 7900
DEFAULT_STREAM_PORT = 19885
BLACKLISTED_ANSIBLE_CONNECTION_TYPES = [
    'network_cli', 'kubectl', 'project', 'namespace']
BLACKLISTED_VARS = dict(
    ansible_ssh_executable='ssh',
    ansible_ssh_common_args='-o PermitLocalCommand=no',
    ansible_sftp_extra_args='-o PermitLocalCommand=no',
    ansible_scp_extra_args='-o PermitLocalCommand=no',
    ansible_ssh_extra_args='-o PermitLocalCommand=no',
)


class StopException(Exception):
    """An exception raised when an inner loop is asked to stop."""
    pass


class ExecutorError(Exception):
    """A non-transient run-time executor error

    This class represents error conditions detected by the executor
    when preparing to run a job which we know are consistently fatal.
    Zuul should not reschedule the build in these cases.
    """
    pass


class RoleNotFoundError(ExecutorError):
    pass


class PluginFoundError(ExecutorError):
    pass


class DiskAccountant(object):
    '''A single thread to periodically run du and monitor a base directory

    Whenever the accountant notices a dir over limit, it will call the
    given func with an argument of the job directory. That function
    should be used to remediate the problem, generally by killing the
    job producing the disk bloat. The function will be called every
    time the problem is noticed, so it should be handled synchronously
    to avoid stacking up calls.
    '''
    log = logging.getLogger("zuul.ExecutorDiskAccountant")

    def __init__(self, jobs_base, limit, func, cache_dir, usage_func=None):
        '''
        :param str jobs_base: absolute path name of dir to be monitored
        :param int limit: maximum number of MB allowed to be in use in any one
                          subdir
        :param callable func: Function to call with overlimit dirs
        :param str cache_dir: absolute path name of dir to be passed as the
                              first argument to du. This will ensure du does
                              not count any hardlinks to files in this
                              directory against a single job.
        :param callable usage_func: Optional function to call with usage
                                    for every dir _NOT_ over limit
        '''
        # Don't cross the streams
        if cache_dir == jobs_base:
            raise Exception("Cache dir and jobs dir cannot be the same")
        self.thread = threading.Thread(target=self._run,
                                       name='diskaccountant')
        self.thread.daemon = True
        self._running = False
        self.jobs_base = jobs_base
        self.limit = limit
        self.func = func
        self.cache_dir = cache_dir
        self.usage_func = usage_func
        self.stop_event = threading.Event()

    def _run(self):
        while self._running:
            # Walk job base
            before = time.time()
            du = subprocess.Popen(
                ['du', '-m', '--max-depth=1', self.cache_dir, self.jobs_base],
                stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
            for line in du.stdout:
                (size, dirname) = line.rstrip().split()
                dirname = dirname.decode('utf8')
                if dirname == self.jobs_base or dirname == self.cache_dir:
                    continue
                if os.path.dirname(dirname) == self.cache_dir:
                    continue
                size = int(size)
                if size > self.limit:
                    self.log.warning(
                        "{job} is using {size}MB (limit={limit})"
                        .format(size=size, job=dirname, limit=self.limit))
                    self.func(dirname)
                elif self.usage_func:
                    self.log.debug(
                        "{job} is using {size}MB (limit={limit})"
                        .format(size=size, job=dirname, limit=self.limit))
                    self.usage_func(dirname, size)
            du.wait()
            du.stdout.close()
            after = time.time()
            # Sleep half as long as that took, or 1s, whichever is longer
            delay_time = max((after - before) / 2, 1.0)
            self.stop_event.wait(delay_time)

    def start(self):
        if self.limit < 0:
            # No need to start if there is no limit.
            return
        self._running = True
        self.thread.start()

    def stop(self):
        if not self.running:
            return
        self._running = False
        self.stop_event.set()
        self.thread.join()

    @property
    def running(self):
        return self._running
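

# Illustrative sketch (not part of the upstream file): how a caller might
# wire up the DiskAccountant above. The paths, limit and handler below are
# assumptions for the example only; the real executor passes a handler that
# stops the build using too much disk.
def _example_disk_accountant_usage():  # example only, never called at import
    def handle_over_limit(jobdir):
        print("directory over its disk limit:", jobdir)

    accountant = DiskAccountant('/var/lib/zuul/builds', limit=250,
                                func=handle_over_limit,
                                cache_dir='/var/cache/zuul/git')
    accountant.start()
    try:
        time.sleep(60)  # monitoring runs on the background thread
    finally:
        accountant.stop()
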

class Watchdog(object):
    def __init__(self, timeout, function, args):
        self.timeout = timeout
        self.function = function
        self.args = args
        self.thread = threading.Thread(target=self._run,
                                       name='watchdog')
        self.thread.daemon = True
        self.timed_out = None
        self.end = 0
        self._running = False
        self._stop_event = threading.Event()

    def _run(self):
        while self._running and time.time() < self.end:
            self._stop_event.wait(10)
        if self._running:
            self.timed_out = True
            self.function(*self.args)
        else:
            # Only set timed_out to false if we aren't _running
            # anymore. This means that we stopped running not because
            # of a timeout but because normal execution ended.
            self.timed_out = False

    def start(self):
        self._running = True
        self.end = time.time() + self.timeout
        self.thread.start()

    def stop(self):
        self._running = False
        self._stop_event.set()
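

# Illustrative sketch (not part of the upstream file): Watchdog is a
# one-shot timer thread; it calls the supplied function with the supplied
# arguments if stop() has not been called before the timeout expires. The
# callback and build id below are assumptions made up for the example.
def _example_watchdog_usage():  # example only, never called at import time
    def on_timeout(build_id):
        print("build %s exceeded its timeout" % build_id)

    watchdog = Watchdog(30, on_timeout, ("some-build-uuid",))
    watchdog.start()
    try:
        pass  # ... do the time-limited work here ...
    finally:
        watchdog.stop()
    # watchdog.timed_out is True if on_timeout fired, False otherwise.
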

class SshAgent(object):
    def __init__(self, zuul_event_id=None, build=None):
        self.env = {}
        self.ssh_agent = None
        self.log = get_annotated_logger(
            logging.getLogger("zuul.ExecutorServer"),
            zuul_event_id, build=build)

    def start(self):
        if self.ssh_agent:
            return
        with open('/dev/null', 'r+') as devnull:
            ssh_agent = subprocess.Popen(['ssh-agent'], close_fds=True,
                                         stdout=subprocess.PIPE,
                                         stderr=devnull,
                                         stdin=devnull)
        (output, _) = ssh_agent.communicate()
        output = output.decode('utf8')
        for line in output.split("\n"):
            if '=' in line:
                line = line.split(";", 1)[0]
                (key, value) = line.split('=')
                self.env[key] = value
        self.log.info('Started SSH Agent, {}'.format(self.env))

    def stop(self):
        if 'SSH_AGENT_PID' in self.env:
            try:
                os.kill(int(self.env['SSH_AGENT_PID']), signal.SIGTERM)
            except OSError:
                self.log.exception(
                    'Problem sending SIGTERM to agent {}'.format(self.env))
            self.log.debug('Sent SIGTERM to SSH Agent, {}'.format(self.env))
            self.env = {}

    def __del__(self):
        try:
            self.stop()
        except Exception:
            self.log.exception('Exception in SshAgent destructor')
        try:
            super().__del__(self)
        except AttributeError:
            pass

    def add(self, key_path):
        env = os.environ.copy()
        env.update(self.env)
        key_path = os.path.expanduser(key_path)
        self.log.debug('Adding SSH Key {}'.format(key_path))
        try:
            subprocess.check_output(['ssh-add', key_path], env=env,
                                    stderr=subprocess.PIPE)
        except subprocess.CalledProcessError as e:
            self.log.exception('ssh-add failed. stdout: %s, stderr: %s',
                               e.output, e.stderr)
            raise
        self.log.info('Added SSH Key {}'.format(key_path))

    def addData(self, name, key_data):
        env = os.environ.copy()
        env.update(self.env)
        self.log.debug('Adding SSH Key {}'.format(name))
        try:
            subprocess.check_output(['ssh-add', '-'], env=env,
                                    input=key_data.encode('utf8'),
                                    stderr=subprocess.PIPE)
        except subprocess.CalledProcessError as e:
            self.log.exception('ssh-add failed. stdout: %s, stderr: %s',
                               e.output, e.stderr)
            raise
        self.log.info('Added SSH Key {}'.format(name))

    def remove(self, key_path):
        env = os.environ.copy()
        env.update(self.env)
        key_path = os.path.expanduser(key_path)
        self.log.debug('Removing SSH Key {}'.format(key_path))
        subprocess.check_output(['ssh-add', '-d', key_path], env=env,
                                stderr=subprocess.PIPE)
        self.log.info('Removed SSH Key {}'.format(key_path))

    def list(self):
        if 'SSH_AUTH_SOCK' not in self.env:
            return None
        env = os.environ.copy()
        env.update(self.env)
        result = []
        for line in subprocess.Popen(['ssh-add', '-L'], env=env,
                                     stdout=subprocess.PIPE).stdout:
            line = line.decode('utf8')
            if line.strip() == 'The agent has no identities.':
                break
            result.append(line.strip())
        return result
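

# Illustrative sketch (not part of the upstream file): each build gets its
# own ssh-agent so keys added for one build never leak into another. The key
# path below is an assumption for the example.
def _example_ssh_agent_usage():  # example only, never called at import time
    agent = SshAgent()
    agent.start()
    try:
        agent.add('~/.ssh/id_rsa')
        print(agent.list())
    finally:
        agent.stop()
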

class KubeFwd(object):
    kubectl_command = 'kubectl'

    def __init__(self, zuul_event_id, build, kubeconfig, context,
                 namespace, pod):
        self.port = None
        self.fwd = None
        self.log = get_annotated_logger(
            logging.getLogger("zuul.ExecutorServer"),
            zuul_event_id, build=build)
        self.kubeconfig = kubeconfig
        self.context = context
        self.namespace = namespace
        self.pod = pod

    def start(self):
        if self.fwd:
            return
        with open('/dev/null', 'r+') as devnull:
            fwd = subprocess.Popen(
                [self.kubectl_command, '--kubeconfig=%s' % self.kubeconfig,
                 '--context=%s' % self.context,
                 '-n', self.namespace,
                 'port-forward',
                 'pod/%s' % self.pod, ':19885'],
                close_fds=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                stdin=devnull)
        line = fwd.stdout.readline().decode('utf8')
        m = re.match(r'^Forwarding from 127.0.0.1:(\d+) -> 19885', line)
        if m:
            self.port = m.group(1)
        else:
            try:
                self.log.error("Could not find the forwarded port: %s", line)
                fwd.kill()
            except Exception:
                pass
            raise Exception("Unable to start kubectl port forward")
        self.fwd = fwd
        self.log.info('Started Kubectl port forward on port {}'.format(
            self.port))

    def stop(self):
        try:
            if self.fwd:
                self.fwd.kill()
                self.fwd.wait()
                # clear stdout buffer before it's gone to not miss out on
                # potential connection errors
                fwd_stdout = [line.decode('utf8') for line in self.fwd.stdout]
                self.log.debug(
                    "Rest of kubectl port forward output was: %s",
                    "".join(fwd_stdout)
                )
            self.fwd = None
        except Exception:
            self.log.exception('Unable to stop kubectl port-forward:')

    def __del__(self):
        try:
            self.stop()
        except Exception:
            self.log.exception('Exception in KubeFwd destructor')
        try:
            super().__del__(self)
        except AttributeError:
            pass
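

# Illustrative sketch (not part of the upstream file): KubeFwd wraps
# "kubectl port-forward" so the executor can reach port 19885 (the log
# streaming port used above) inside a build pod. The kubeconfig, context,
# namespace and pod names below are assumptions for the example.
def _example_kube_fwd_usage():  # example only, never called at import time
    fwd = KubeFwd(zuul_event_id=None, build='some-build-uuid',
                  kubeconfig='/tmp/kube/config', context='default',
                  namespace='zuul-builds', pod='builder-pod')
    fwd.start()
    try:
        print("local port forwarded to pod:19885 ->", fwd.port)
    finally:
        fwd.stop()
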

class JobDirPlaybook(object):
    def __init__(self, root):
        self.root = root
        self.trusted = None
        self.project_canonical_name = None
        self.branch = None
        self.canonical_name_and_path = None
        self.path = None
        self.roles = []
        self.roles_path = []
        self.ansible_config = os.path.join(self.root, 'ansible.cfg')
        self.project_link = os.path.join(self.root, 'project')
        self.secrets_root = os.path.join(self.root, 'secrets')
        os.makedirs(self.secrets_root)
        self.secrets = os.path.join(self.secrets_root, 'secrets.yaml')
        self.secrets_content = None

    def addRole(self):
        count = len(self.roles)
        root = os.path.join(self.root, 'role_%i' % (count,))
        os.makedirs(root)
        self.roles.append(root)
        return root


class JobDir(object):
    def __init__(self, root, keep, build_uuid):
        '''
        :param str root: Root directory for the individual job directories.
            Can be None to use the default system temp root directory.
        :param bool keep: If True, do not delete the job directory.
        :param str build_uuid: The unique build UUID. If supplied, this will
            be used as the temp job directory name. Using this will help the
            log streaming daemon find job logs.
        '''
        # root
        #   ansible (mounted in bwrap read-only)
        #     logging.json
        #     inventory.yaml
        #     extra_vars.yaml
        #     vars_blacklist.yaml
        #   .ansible (mounted in bwrap read-write)
        #     fact-cache/localhost
        #     cp
        #   playbook_0 (mounted in bwrap for each playbook read-only)
        #     secrets.yaml
        #     project -> ../trusted/project_0/...
        #     role_0 -> ../trusted/project_0/...
        #   trusted (mounted in bwrap read-only)
        #     project_0
        #       <git.example.com>
        #         <project>
        #   untrusted (mounted in bwrap read-only)
        #     project_0
        #       <git.example.com>
        #         <project>
        #   work (mounted in bwrap read-write)
        #     .ssh
        #       known_hosts
        #     .kube
        #       config
        #     src
        #       <git.example.com>
        #         <project>
        #     logs
        #       job-output.txt
        #     tmp
        #     results.json
        self.keep = keep
        if root:
            tmpdir = root
        else:
            tmpdir = tempfile.gettempdir()
        self.root = os.path.realpath(os.path.join(tmpdir, build_uuid))
        os.mkdir(self.root, 0o700)
        self.work_root = os.path.join(self.root, 'work')
        os.makedirs(self.work_root)
        self.src_root = os.path.join(self.work_root, 'src')
        os.makedirs(self.src_root)
        self.log_root = os.path.join(self.work_root, 'logs')
        os.makedirs(self.log_root)
        # Create local tmp directory
        # NOTE(tobiash): This must live within the work root as it can be used
        # by ansible for temporary files which are path checked in untrusted
        # jobs.
        self.local_tmp = os.path.join(self.work_root, 'tmp')
        os.makedirs(self.local_tmp)
        self.ansible_root = os.path.join(self.root, 'ansible')
        os.makedirs(self.ansible_root)
        self.ansible_vars_blacklist = os.path.join(
            self.ansible_root, 'vars_blacklist.yaml')
        with open(self.ansible_vars_blacklist, 'w') as blacklist:
            blacklist.write(json.dumps(BLACKLISTED_VARS))
        self.trusted_root = os.path.join(self.root, 'trusted')
        os.makedirs(self.trusted_root)
        self.untrusted_root = os.path.join(self.root, 'untrusted')
        os.makedirs(self.untrusted_root)
        ssh_dir = os.path.join(self.work_root, '.ssh')
        os.mkdir(ssh_dir, 0o700)
        kube_dir = os.path.join(self.work_root, ".kube")
        os.makedirs(kube_dir)
        self.kubeconfig = os.path.join(kube_dir, "config")
        # Create ansible cache directory
        self.ansible_cache_root = os.path.join(self.root, '.ansible')
        self.fact_cache = os.path.join(self.ansible_cache_root, 'fact-cache')
        os.makedirs(self.fact_cache)
        self.control_path = os.path.join(self.ansible_cache_root, 'cp')
        self.job_unreachable_file = os.path.join(self.ansible_cache_root,
                                                 'nodes.unreachable')
        os.makedirs(self.control_path)
        localhost_facts = os.path.join(self.fact_cache, 'localhost')
        jobtime = datetime.datetime.utcnow()
        date_time_facts = {}
        date_time_facts['year'] = jobtime.strftime('%Y')
        date_time_facts['month'] = jobtime.strftime('%m')
        date_time_facts['weekday'] = jobtime.strftime('%A')
        date_time_facts['weekday_number'] = jobtime.strftime('%w')
        date_time_facts['weeknumber'] = jobtime.strftime('%W')
        date_time_facts['day'] = jobtime.strftime('%d')
        date_time_facts['hour'] = jobtime.strftime('%H')
        date_time_facts['minute'] = jobtime.strftime('%M')
        date_time_facts['second'] = jobtime.strftime('%S')
        date_time_facts['epoch'] = jobtime.strftime('%s')
        date_time_facts['date'] = jobtime.strftime('%Y-%m-%d')
        date_time_facts['time'] = jobtime.strftime('%H:%M:%S')
        date_time_facts['iso8601_micro'] = \
            jobtime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%fZ")
        date_time_facts['iso8601'] = \
            jobtime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
        date_time_facts['iso8601_basic'] = jobtime.strftime("%Y%m%dT%H%M%S%f")
        date_time_facts['iso8601_basic_short'] = \
            jobtime.strftime("%Y%m%dT%H%M%S")
        # Set the TZ data manually as jobtime is naive.
        date_time_facts['tz'] = 'UTC'
        date_time_facts['tz_offset'] = '+0000'
        executor_facts = {}
        executor_facts['date_time'] = date_time_facts
        executor_facts['module_setup'] = True
        # NOTE(pabelanger): We do not want to leak zuul-executor facts to other
        # playbooks now that smart fact gathering is enabled by default. We
        # can have ansible skip populating the cache with information by
        # writing a file with the minimum facts we want.
        with open(localhost_facts, 'w') as f:
            json.dump(executor_facts, f)
        self.result_data_file = os.path.join(self.work_root, 'results.json')
        with open(self.result_data_file, 'w'):
            pass
        self.known_hosts = os.path.join(ssh_dir, 'known_hosts')
        self.inventory = os.path.join(self.ansible_root, 'inventory.yaml')
        self.extra_vars = os.path.join(self.ansible_root, 'extra_vars.yaml')
        self.setup_inventory = os.path.join(self.ansible_root,
                                            'setup-inventory.yaml')
        self.logging_json = os.path.join(self.ansible_root, 'logging.json')
        self.playbooks = []  # The list of candidate playbooks
        self.pre_playbooks = []
        self.post_playbooks = []
        self.cleanup_playbooks = []
        self.job_output_file = os.path.join(self.log_root, 'job-output.txt')
        # We need to create the job-output.txt upfront in order to close the
        # gap between url reporting and ansible creating the file. Otherwise
        # there is a period of time where the user can click on the live log
        # link on the status page but the log streaming fails because the file
        # is not there yet.
        with open(self.job_output_file, 'w') as job_output:
            job_output.write("{now} | Job console starting...\n".format(
                now=datetime.datetime.now()
            ))
        self.trusted_projects = []
        self.trusted_project_index = {}
        self.untrusted_projects = []
        self.untrusted_project_index = {}
        # Create a JobDirPlaybook for the Ansible setup run. This
        # doesn't use an actual playbook, but it lets us use the same
        # methods to write an ansible.cfg as the rest of the Ansible
        # runs.
        setup_root = os.path.join(self.ansible_root, 'setup_playbook')
        os.makedirs(setup_root)
        self.setup_playbook = JobDirPlaybook(setup_root)
        self.setup_playbook.trusted = True

    def addTrustedProject(self, canonical_name, branch):
        # Trusted projects are placed in their own directories so that
        # we can support using different branches of the same project
        # in different playbooks.
        count = len(self.trusted_projects)
        root = os.path.join(self.trusted_root, 'project_%i' % (count,))
        os.makedirs(root)
        self.trusted_projects.append(root)
        self.trusted_project_index[(canonical_name, branch)] = root
        return root

    def getTrustedProject(self, canonical_name, branch):
        return self.trusted_project_index.get((canonical_name, branch))

    def addUntrustedProject(self, canonical_name, branch):
        # Similar to trusted projects, but these hold checkouts of
        # projects which are allowed to have speculative changes
        # applied. They might, however, be different branches than
        # what is used in the working dir, so they need their own
        # location. Moreover, we might avoid mischief if a job alters
        # the contents of the working dir.
        count = len(self.untrusted_projects)
        root = os.path.join(self.untrusted_root, 'project_%i' % (count,))
        os.makedirs(root)
        self.untrusted_projects.append(root)
        self.untrusted_project_index[(canonical_name, branch)] = root
        return root

    def getUntrustedProject(self, canonical_name, branch):
        return self.untrusted_project_index.get((canonical_name, branch))

    def addPrePlaybook(self):
        count = len(self.pre_playbooks)
        root = os.path.join(self.ansible_root, 'pre_playbook_%i' % (count,))
        os.makedirs(root)
        playbook = JobDirPlaybook(root)
        self.pre_playbooks.append(playbook)
        return playbook

    def addPostPlaybook(self):
        count = len(self.post_playbooks)
        root = os.path.join(self.ansible_root, 'post_playbook_%i' % (count,))
        os.makedirs(root)
        playbook = JobDirPlaybook(root)
        self.post_playbooks.append(playbook)
        return playbook

    def addCleanupPlaybook(self):
        count = len(self.cleanup_playbooks)
        root = os.path.join(
            self.ansible_root, 'cleanup_playbook_%i' % (count,))
        os.makedirs(root)
        playbook = JobDirPlaybook(root)
        self.cleanup_playbooks.append(playbook)
        return playbook

    def addPlaybook(self):
        count = len(self.playbooks)
        root = os.path.join(self.ansible_root, 'playbook_%i' % (count,))
        os.makedirs(root)
        playbook = JobDirPlaybook(root)
        self.playbooks.append(playbook)
        return playbook

    def cleanup(self):
        if not self.keep:
            shutil.rmtree(self.root)

    def __enter__(self):
        return self

    def __exit__(self, etype, value, tb):
        self.cleanup()
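

# Illustrative sketch (not part of the upstream file): JobDir is a context
# manager, so the per-build directory tree sketched in the comment above is
# removed automatically on exit unless keep=True. The root path and UUID
# below are assumptions for the example.
def _example_job_dir_usage():  # example only, never called at import time
    with JobDir('/var/lib/zuul/builds', keep=False,
                build_uuid='0123456789abcdef') as jobdir:
        print(jobdir.work_root, jobdir.log_root)
        playbook = jobdir.addPrePlaybook()
        print(playbook.root)
    # On leaving the with-block the directory is deleted (keep=False).
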

class UpdateTask(object):
    def __init__(self, connection_name, project_name, repo_state=None,
                 zuul_event_id=None, build=None):
        self.connection_name = connection_name
        self.project_name = project_name
        self.repo_state = repo_state
        self.canonical_name = None
        self.branches = None
        self.refs = None
        self.event = threading.Event()
        self.success = False
        # These variables are used for log annotation
        self.zuul_event_id = zuul_event_id
        self.build = build

    def __eq__(self, other):
        if (other and other.connection_name == self.connection_name and
                other.project_name == self.project_name and
                other.repo_state == self.repo_state):
            return True
        return False

    def wait(self):
        self.event.wait()

    def setComplete(self):
        self.event.set()


class DeduplicateQueue(object):
    def __init__(self):
        self.queue = collections.deque()
        self.condition = threading.Condition()

    def qsize(self):
        return len(self.queue)

    def put(self, item):
        # Returns the original item if added, or an equivalent item if
        # already enqueued.
        self.condition.acquire()
        ret = None
        try:
            for x in self.queue:
                if item == x:
                    ret = x
            if ret is None:
                ret = item
                self.queue.append(item)
            self.condition.notify()
        finally:
            self.condition.release()
        return ret

    def get(self):
        self.condition.acquire()
        try:
            while True:
                try:
                    ret = self.queue.popleft()
                    return ret
                except IndexError:
                    pass
                self.condition.wait()
        finally:
            self.condition.release()
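

# Illustrative sketch (not part of the upstream file): put() deduplicates by
# equality, so two UpdateTasks for the same connection, project and repo
# state collapse into one queue entry, and callers wait on whichever task
# object put() returns.
def _example_deduplicate_queue_usage():  # example only, never called
    queue = DeduplicateQueue()
    task_a = UpdateTask('gerrit', 'org/project')
    task_b = UpdateTask('gerrit', 'org/project')  # compares equal to task_a
    first = queue.put(task_a)   # enqueued; returns task_a
    second = queue.put(task_b)  # duplicate; returns the queued task_a
    assert first is second
    assert queue.qsize() == 1
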

def check_varnames(var):
    # We block these in configloader, but block it here too to make
    # sure that a job doesn't pass variables named zuul or nodepool.
    if 'zuul' in var:
        raise Exception("Defining variables named 'zuul' is not allowed")
    if 'nodepool' in var:
        raise Exception("Defining variables named 'nodepool' is not allowed")


def make_setup_inventory_dict(nodes):
    hosts = {}
    for node in nodes:
        if (node['host_vars']['ansible_connection'] in
                BLACKLISTED_ANSIBLE_CONNECTION_TYPES):
            continue
        hosts[node['name']] = node['host_vars']
    inventory = {
        'all': {
            'hosts': hosts,
        }
    }
    return inventory


def is_group_var_set(name, host, args):
    for group in args['groups']:
        if host in group['nodes']:
            group_vars = args['group_vars'].get(group['name'], {})
            if name in group_vars:
                return True
    return False


def make_inventory_dict(nodes, args, all_vars):
    hosts = {}
    for node in nodes:
        hosts[node['name']] = node['host_vars']
    zuul_vars = all_vars['zuul']
    if 'message' in zuul_vars:
        zuul_vars['message'] = base64.b64encode(
            zuul_vars['message'].encode("utf-8")).decode('utf-8')
    inventory = {
        'all': {
            'hosts': hosts,
            'vars': all_vars,
        }
    }
    for group in args['groups']:
        if 'children' not in inventory['all']:
            inventory['all']['children'] = dict()
        group_hosts = {}
        for node_name in group['nodes']:
            group_hosts[node_name] = None
        group_vars = args['group_vars'].get(group['name'], {}).copy()
        check_varnames(group_vars)
        inventory['all']['children'].update({
            group['name']: {
                'hosts': group_hosts,
                'vars': group_vars,
            }})
    return inventory
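

# Illustrative sketch (not part of the upstream file): for a single node and
# one group, make_inventory_dict() produces a structure roughly like the
# commented literal below. The node name, address and variables are
# assumptions for the example.
def _example_make_inventory_dict():  # example only, never called
    nodes = [{'name': 'node1', 'host_vars': {'ansible_host': '203.0.113.5'}}]
    args = {'groups': [{'name': 'web', 'nodes': ['node1']}],
            'group_vars': {'web': {'role': 'frontend'}}}
    all_vars = {'zuul': {'message': 'commit message'}}
    inventory = make_inventory_dict(nodes, args, all_vars)
    # inventory == {
    #     'all': {
    #         'hosts': {'node1': {'ansible_host': '203.0.113.5'}},
    #         'vars': {'zuul': {'message': '<base64 of the commit message>'}},
    #         'children': {
    #             'web': {'hosts': {'node1': None},
    #                     'vars': {'role': 'frontend'}},
    #         },
    #     }
    # }
    return inventory
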

class AnsibleJob(object):
    RESULT_NORMAL = 1
    RESULT_TIMED_OUT = 2
    RESULT_UNREACHABLE = 3
    RESULT_ABORTED = 4
    RESULT_DISK_FULL = 5

    RESULT_MAP = {
        RESULT_NORMAL: 'RESULT_NORMAL',
        RESULT_TIMED_OUT: 'RESULT_TIMED_OUT',
        RESULT_UNREACHABLE: 'RESULT_UNREACHABLE',
        RESULT_ABORTED: 'RESULT_ABORTED',
        RESULT_DISK_FULL: 'RESULT_DISK_FULL',
    }

    def __init__(self, executor_server, job):
        logger = logging.getLogger("zuul.AnsibleJob")
        self.arguments = json.loads(job.arguments)
        self.zuul_event_id = self.arguments.get('zuul_event_id')
        # Record ansible version being used for the cleanup phase
        self.ansible_version = self.arguments.get('ansible_version')
        self.log = get_annotated_logger(
            logger, self.zuul_event_id, build=job.unique)
        self.executor_server = executor_server
        self.job = job
        self.jobdir = None
        self.proc = None
        self.proc_lock = threading.Lock()
        self.running = False
        self.started = False  # Whether playbooks have started running
        self.time_starting_build = None
        self.paused = False
        self.aborted = False
        self.aborted_reason = None
        self.cleanup_started = False
        self._resume_event = threading.Event()
        self.thread = None
        self.project_info = {}
        self.private_key_file = get_default(self.executor_server.config,
                                            'executor', 'private_key_file',
                                            '~/.ssh/id_rsa')
        self.winrm_key_file = get_default(self.executor_server.config,
                                          'executor', 'winrm_cert_key_file',
                                          '~/.winrm/winrm_client_cert.key')
        self.winrm_pem_file = get_default(self.executor_server.config,
                                          'executor', 'winrm_cert_pem_file',
                                          '~/.winrm/winrm_client_cert.pem')
        self.winrm_operation_timeout = get_default(
            self.executor_server.config,
            'executor',
            'winrm_operation_timeout_sec')
        self.winrm_read_timeout = get_default(
            self.executor_server.config,
            'executor',
            'winrm_read_timeout_sec')
        self.ssh_agent = SshAgent(zuul_event_id=self.zuul_event_id,
                                  build=self.job.unique)
        self.port_forwards = []
        self.executor_variables_file = None
        self.cpu_times = {'user': 0, 'system': 0,
                          'children_user': 0, 'children_system': 0}
        if self.executor_server.config.has_option('executor', 'variables'):
            self.executor_variables_file = self.executor_server.config.get(
                'executor', 'variables')
        plugin_dir = self.executor_server.ansible_manager.getAnsiblePluginDir(
            self.arguments.get('ansible_version'))
        self.ara_callbacks = \
            self.executor_server.ansible_manager.getAraCallbackPlugin(
                self.arguments.get('ansible_version'))
        self.library_dir = os.path.join(plugin_dir, 'library')
        self.action_dir = os.path.join(plugin_dir, 'action')
        self.action_dir_general = os.path.join(plugin_dir, 'actiongeneral')
        self.action_dir_trusted = os.path.join(plugin_dir, 'actiontrusted')
        self.callback_dir = os.path.join(plugin_dir, 'callback')
        self.lookup_dir = os.path.join(plugin_dir, 'lookup')
        self.filter_dir = os.path.join(plugin_dir, 'filter')
        self.ansible_callbacks = self.executor_server.ansible_callbacks

    def run(self):
        self.running = True
        self.thread = threading.Thread(target=self.execute,
                                       name='build-%s' % self.job.unique)
        self.thread.start()

    def stop(self, reason=None):
        self.aborted = True
        self.aborted_reason = reason
        # if paused we need to resume the job so it can be stopped
        self.resume()
        self.abortRunningProc()

    def pause(self):
        self.log.info(
            "Pausing job %s for ref %s (change %s)" % (
                self.arguments['zuul']['job'],
                self.arguments['zuul']['ref'],
                self.arguments['zuul']['change_url']))
        with open(self.jobdir.job_output_file, 'a') as job_output:
            job_output.write(
                "{now} |\n"
                "{now} | Job paused\n".format(now=datetime.datetime.now()))
        self.paused = True
        data = {'paused': self.paused, 'data': self.getResultData()}
        self.job.sendWorkData(json.dumps(data))
        self._resume_event.wait()

    def resume(self):
        if not self.paused:
            return
        self.log.info(
            "Resuming job %s for ref %s (change %s)" % (
                self.arguments['zuul']['job'],
                self.arguments['zuul']['ref'],
                self.arguments['zuul']['change_url']))
        with open(self.jobdir.job_output_file, 'a') as job_output:
            job_output.write(
                "{now} | Job resumed\n"
                "{now} |\n".format(now=datetime.datetime.now()))
        self.paused = False
        self._resume_event.set()

    def wait(self):
        if self.thread:
            self.thread.join()

    def execute(self):
        try:
            self.time_starting_build = time.monotonic()

            # report that job has been taken
            self.job.sendWorkData(json.dumps(self._base_job_data()))

            self.ssh_agent.start()
            self.ssh_agent.add(self.private_key_file)
            for key in self.arguments.get('ssh_keys', []):
                self.ssh_agent.addData(key['name'], key['key'])
            self.jobdir = JobDir(self.executor_server.jobdir_root,
                                 self.executor_server.keep_jobdir,
                                 str(self.job.unique))
            self._execute()
        except BrokenProcessPool:
            # The process pool got broken, re-initialize it and send
            # ABORTED so we re-try the job.
            self.log.exception('Process pool got broken')
            self.executor_server.resetProcessPool()
            self._send_aborted()
        except ExecutorError as e:
            result_data = json.dumps(dict(result='ERROR',
                                          error_detail=e.args[0]))
            self.log.debug("Sending result: %s" % (result_data,))
            self.job.sendWorkComplete(result_data)
        except Exception:
            self.log.exception("Exception while executing job")
            self.job.sendWorkException(traceback.format_exc())
        finally:
            self.running = False
            if self.jobdir:
                try:
                    self.jobdir.cleanup()
                except Exception:
                    self.log.exception("Error cleaning up jobdir:")
            if self.ssh_agent:
                try:
                    self.ssh_agent.stop()
                except Exception:
                    self.log.exception("Error stopping SSH agent:")
            for fwd in self.port_forwards:
                try:
                    fwd.stop()
                except Exception:
                    self.log.exception("Error stopping port forward:")
            try:
                self.executor_server.finishJob(self.job.unique)
            except Exception:
                self.log.exception("Error finalizing job thread:")
            self.log.info("Job execution took: %.3f seconds" % (
                time.monotonic() - self.time_starting_build))

    def _base_job_data(self):
        return {
            # TODO(mordred) worker_name is needed as a unique name for the
            # client to use for cancelling jobs on an executor. It's
            # defaulting to the hostname for now, but in the future we
            # should allow setting a per-executor override so that one can
            # run more than one executor on a host.
            'worker_name': self.executor_server.hostname,
            'worker_hostname': self.executor_server.hostname,
            'worker_log_port': self.executor_server.log_streaming_port,
        }

    def _send_aborted(self):
        result = dict(result='ABORTED')
        self.job.sendWorkComplete(json.dumps(result))

    def _execute(self):
        args = self.arguments
        self.log.info(
            "Beginning job %s for ref %s (change %s)" % (
                args['zuul']['job'],
                args['zuul']['ref'],
                args['zuul']['change_url']))
        self.log.debug("Job root: %s" % (self.jobdir.root,))
        tasks = []
        projects = set()
        repo_state = args['repo_state']

        # Make sure all projects used by the job are updated...
        for project in args['projects']:
            self.log.debug("Updating project %s" % (project,))
            tasks.append(self.executor_server.update(
                project['connection'], project['name'],
                repo_state=repo_state,
                zuul_event_id=self.zuul_event_id,
                build=self.job.unique))
            projects.add((project['connection'], project['name']))

        # ...as well as all playbook and role projects.
        repos = []
        playbooks = (args['pre_playbooks'] + args['playbooks'] +
                     args['post_playbooks'] + args['cleanup_playbooks'])
        for playbook in playbooks:
            repos.append(playbook)
            repos += playbook['roles']

        for repo in repos:
            key = (repo['connection'], repo['project'])
            if key not in projects:
                self.log.debug("Updating playbook or role %s" % (
                    repo['project'],))
                tasks.append(self.executor_server.update(
                    *key, repo_state=repo_state,
                    zuul_event_id=self.zuul_event_id,
                    build=self.job.unique))
                projects.add(key)

        for task in tasks:
            task.wait()
            if not task.success:
                raise ExecutorError(
                    'Failed to update project %s' % task.canonical_name)
            self.project_info[task.canonical_name] = {
                'refs': task.refs,
                'branches': task.branches,
            }

        # Early abort if abort requested
        if self.aborted:
            self._send_aborted()
            return

        self.log.debug("Git updates complete")
        merger = self.executor_server._getMerger(
            self.jobdir.src_root,
            self.executor_server.merge_root,
            self.log)

        repos = {}
        for project in args['projects']:
            self.log.debug("Cloning %s/%s" % (project['connection'],
                                              project['name'],))
            repo = merger.getRepo(project['connection'],
                                  project['name'])
            repos[project['canonical_name']] = repo

        # The commit ID of the original item (before merging). Used
        # later for line mapping.
        item_commit = None

        merge_items = [i for i in args['items'] if i.get('number')]
        if merge_items:
            item_commit = self.doMergeChanges(
                merger, merge_items, repo_state)
            if item_commit is None:
                # There was a merge conflict and we have already sent
                # a work complete result, don't run any jobs
                return

        # Early abort if abort requested
        if self.aborted:
            self._send_aborted()
            return

        state_items = [i for i in args['items'] if not i.get('number')]
        if state_items:
            merger.setRepoState(
                state_items, repo_state,
                process_worker=self.executor_server.process_worker)

        # Early abort if abort requested
        if self.aborted:
            self._send_aborted()
            return

        for project in args['projects']:
            repo = repos[project['canonical_name']]
            # If this project is the Zuul project and this is a ref
            # rather than a change, checkout the ref.
            if (project['canonical_name'] ==
                    args['zuul']['project']['canonical_name'] and
                    (not args['zuul'].get('branch')) and
                    args['zuul'].get('ref')):
                ref = args['zuul']['ref']
            else:
                ref = None
            selected_ref, selected_desc = self.resolveBranch(
                project['canonical_name'],
                ref,
                args['branch'],
                args['override_branch'],
                args['override_checkout'],
                project['override_branch'],
                project['override_checkout'],
                project['default_branch'])
            self.log.info("Checking out %s %s %s",
                          project['canonical_name'], selected_desc,
                          selected_ref)
            repo.checkout(selected_ref)

            # Update the inventory variables to indicate the ref we
            # checked out
            p = args['zuul']['projects'][project['canonical_name']]
            p['checkout'] = selected_ref

        # Set the URL of the origin remote for each repo to a bogus
        # value. Keeping the remote allows tools to use it to determine
        # which commits are part of the current change.
        for repo in repos.values():
            repo.setRemoteUrl('file:///dev/null')

        # Early abort if abort requested
        if self.aborted:
            self._send_aborted()
            return

        # This prepares each playbook and the roles needed for each.
        self.preparePlaybooks(args)

        self.prepareAnsibleFiles(args)
        self.writeLoggingConfig()

        # Early abort if abort requested
        if self.aborted:
            self._send_aborted()
            return

        data = self._base_job_data()
        if self.executor_server.log_streaming_port != DEFAULT_FINGER_PORT:
            data['url'] = "finger://{hostname}:{port}/{uuid}".format(
                hostname=self.executor_server.hostname,
                port=self.executor_server.log_streaming_port,
                uuid=self.job.unique)
        else:
            data['url'] = 'finger://{hostname}/{uuid}'.format(
                hostname=self.executor_server.hostname,
                uuid=self.job.unique)

        self.job.sendWorkData(json.dumps(data))
        self.job.sendWorkStatus(0, 100)

        result = self.runPlaybooks(args)
        success = result == 'SUCCESS'

        self.runCleanupPlaybooks(success)

        # Stop the persistent SSH connections.
        setup_status, setup_code = self.runAnsibleCleanup(
            self.jobdir.setup_playbook)

        if self.aborted_reason == self.RESULT_DISK_FULL:
            result = 'DISK_FULL'
        data = self.getResultData()
        warnings = []
        self.mapLines(merger, args, data, item_commit, warnings)
        result_data = json.dumps(dict(result=result,
                                      warnings=warnings,
                                      data=data))
        self.log.debug("Sending result: %s" % (result_data,))
        self.job.sendWorkComplete(result_data)

    def getResultData(self):
        data = {}
        try:
            with open(self.jobdir.result_data_file) as f:
                file_data = f.read()
            if file_data:
                data = json.loads(file_data)
        except Exception:
            self.log.exception("Unable to load result data:")
        return data

    def mapLines(self, merger, args, data, commit, warnings):
        # The data and warnings arguments are mutated in this method.

        # If we received file comments, map the line numbers before
        # we send the result.
        fc = data.get('zuul', {}).get('file_comments')
        if not fc:
            return
        disable = data.get('zuul', {}).get('disable_file_comment_line_mapping')
        if disable:
            return

        try:
            filecomments.validate(fc)
        except Exception as e:
            warnings.append("Job %s: validation error in file comments: %s" %
                            (args['zuul']['job'], str(e)))
            del data['zuul']['file_comments']
            return

        repo = None
        for project in args['projects']:
            if (project['canonical_name'] !=
                    args['zuul']['project']['canonical_name']):
                continue
            repo = merger.getRepo(project['connection'],
                                  project['name'])
        # If the repo doesn't exist, abort
        if not repo:
            return

        # Check out the selected ref again in case the job altered the
        # repo state.
        p = args['zuul']['projects'][project['canonical_name']]
        selected_ref = p['checkout']

        self.log.info("Checking out %s %s for line mapping",
                      project['canonical_name'], selected_ref)
        try:
            repo.checkout(selected_ref)
        except Exception:
            # If checkout fails, abort
            self.log.exception("Error checking out repo for line mapping")
            warnings.append("Job %s: unable to check out repo "
                            "for file comments" % (args['zuul']['job']))
            return

        lines = filecomments.extractLines(fc)
        new_lines = {}
        for (filename, lineno) in lines:
            try:
                new_lineno = repo.mapLine(commit, filename, lineno)
            except Exception as e:
                # Log at debug level since it's likely a job issue
                self.log.debug("Error mapping line:", exc_info=True)
                if isinstance(e, git.GitCommandError):
                    msg = e.stderr
                else:
                    msg = str(e)
                warnings.append("Job %s: unable to map line "
                                "for file comments: %s" %
                                (args['zuul']['job'], msg))
                new_lineno = None
            if new_lineno is not None:
                new_lines[(filename, lineno)] = new_lineno
        filecomments.updateLines(fc, new_lines)

    def doMergeChanges(self, merger, items, repo_state):
        try:
            ret = merger.mergeChanges(
                items, repo_state=repo_state,
                process_worker=self.executor_server.process_worker)
        except ValueError:
            # Return ABORTED so that we'll try again. At this point all of
            # the refs we're trying to merge should be valid refs. If we
            # can't fetch them, it should resolve itself.
            self.log.exception("Could not fetch refs to merge from remote")
            result = dict(result='ABORTED')
            self.job.sendWorkComplete(json.dumps(result))
            return None
        if not ret:  # merge conflict
            result = dict(result='MERGER_FAILURE')
            if self.executor_server.statsd:
                base_key = "zuul.executor.{hostname}.merger"
                self.executor_server.statsd.incr(base_key + ".FAILURE")
            self.job.sendWorkComplete(json.dumps(result))
            return None

        if self.executor_server.statsd:
            base_key = "zuul.executor.{hostname}.merger"
            self.executor_server.statsd.incr(base_key + ".SUCCESS")

        recent = ret[3]
        orig_commit = ret[4]
        for key, commit in recent.items():
            (connection, project, branch) = key
            # Compare the commit with the repo state. If it's included in the
            # repo state and it's the same we've set this ref already earlier
            # and don't have to set it again.
            repo_state_project = repo_state.get(
                connection, {}).get(project, {})
            repo_state_commit = repo_state_project.get(
                'refs/heads/%s' % branch)
            if repo_state_commit != commit:
                repo = merger.getRepo(connection, project)
                repo.setRef('refs/heads/' + branch, commit)

        return orig_commit

    def resolveBranch(self, project_canonical_name, ref, zuul_branch,
                      job_override_branch, job_override_checkout,
                      project_override_branch, project_override_checkout,
                      project_default_branch):
        branches = self.project_info[project_canonical_name]['branches']
        refs = self.project_info[project_canonical_name]['refs']
        selected_ref = None
        selected_desc = None
        if project_override_checkout in refs:
            selected_ref = project_override_checkout
            selected_desc = 'project override ref'
        elif project_override_branch in branches:
            selected_ref = project_override_branch
            selected_desc = 'project override branch'
        elif job_override_checkout in refs:
            selected_ref = job_override_checkout
            selected_desc = 'job override ref'
        elif job_override_branch in branches:
            selected_ref = job_override_branch
            selected_desc = 'job override branch'
        elif ref and ref.startswith('refs/heads/'):
            selected_ref = ref[len('refs/heads/'):]
            selected_desc = 'branch ref'
        elif ref and ref.startswith('refs/tags/'):
            selected_ref = ref[len('refs/tags/'):]
            selected_desc = 'tag ref'
        elif zuul_branch and zuul_branch in branches:
            selected_ref = zuul_branch
            selected_desc = 'zuul branch'
        elif project_default_branch in branches:
            selected_ref = project_default_branch
            selected_desc = 'project default branch'
        else:
            raise ExecutorError("Project %s does not have the "
                                "default branch %s" %
                                (project_canonical_name,
                                 project_default_branch))
        return (selected_ref, selected_desc)
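
    # Note (not part of the upstream file): resolveBranch() above applies a
    # fixed precedence when choosing what to check out: project override
    # checkout, project override branch, job override checkout, job override
    # branch, an explicit branch or tag ref, the zuul branch, and finally
    # the project default branch. For example, with branches
    # ['master', 'stable'] and job_override_branch='stable', it would return
    # ('stable', 'job override branch'), assuming no higher-priority
    # override matches.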
  1184. def getAnsibleTimeout(self, start, timeout):
  1185. if timeout is not None:
  1186. now = time.time()
  1187. elapsed = now - start
  1188. timeout = timeout - elapsed
  1189. return timeout
  1190. def runPlaybooks(self, args):
  1191. result = None
  1192. with open(self.jobdir.job_output_file, 'a') as job_output:
  1193. job_output.write("{now} | Running Ansible setup...\n".format(
  1194. now=datetime.datetime.now()
  1195. ))
  1196. # Run the Ansible 'setup' module on all hosts in the inventory
  1197. # at the start of the job with a 60 second timeout. If we
  1198. # aren't able to connect to all the hosts and gather facts
  1199. # within that timeout, there is likely a network problem
  1200. # between here and the hosts in the inventory; return them and
  1201. # reschedule the job.
  1202. setup_status, setup_code = self.runAnsibleSetup(
  1203. self.jobdir.setup_playbook, self.ansible_version)
  1204. if setup_status != self.RESULT_NORMAL or setup_code != 0:
  1205. return result
  1206. pre_failed = False
  1207. success = False
  1208. if self.executor_server.statsd:
  1209. key = "zuul.executor.{hostname}.starting_builds"
  1210. self.executor_server.statsd.timing(
  1211. key, (time.monotonic() - self.time_starting_build) * 1000)
  1212. self.started = True
  1213. time_started = time.time()
  1214. # timeout value is "total" job timeout which accounts for
  1215. # pre-run and run playbooks. post-run is different because
  1216. # it is used to copy out job logs and we want to do our best
  1217. # to copy logs even when the job has timed out.
  1218. job_timeout = args['timeout']
  1219. for index, playbook in enumerate(self.jobdir.pre_playbooks):
  1220. # TODOv3(pabelanger): Implement pre-run timeout setting.
  1221. ansible_timeout = self.getAnsibleTimeout(time_started, job_timeout)
  1222. pre_status, pre_code = self.runAnsiblePlaybook(
  1223. playbook, ansible_timeout, self.ansible_version, phase='pre',
  1224. index=index)
  1225. if pre_status != self.RESULT_NORMAL or pre_code != 0:
  1226. # These should really never fail, so return None and have
  1227. # zuul try again
  1228. pre_failed = True
  1229. break
  1230. self.log.debug(
  1231. "Overall ansible cpu times: user=%.2f, system=%.2f, "
  1232. "children_user=%.2f, children_system=%.2f" %
  1233. (self.cpu_times['user'], self.cpu_times['system'],
  1234. self.cpu_times['children_user'],
  1235. self.cpu_times['children_system']))
  1236. if not pre_failed:
  1237. for index, playbook in enumerate(self.jobdir.playbooks):
  1238. ansible_timeout = self.getAnsibleTimeout(
  1239. time_started, job_timeout)
  1240. job_status, job_code = self.runAnsiblePlaybook(
  1241. playbook, ansible_timeout, self.ansible_version,
  1242. phase='run', index=index)
  1243. if job_status == self.RESULT_ABORTED:
  1244. return 'ABORTED'
  1245. elif job_status == self.RESULT_TIMED_OUT:
  1246. # Set the pre-failure flag so this doesn't get
  1247. # overridden by a post-failure.
  1248. pre_failed = True
  1249. result = 'TIMED_OUT'
  1250. break
  1251. elif job_status == self.RESULT_NORMAL:
  1252. success = (job_code == 0)
  1253. if success:
  1254. result = 'SUCCESS'
  1255. else:
  1256. result = 'FAILURE'
  1257. break
  1258. else:
  1259. # The result of the job is indeterminate. Zuul will
  1260. # run it again.
  1261. return None
  1262. # check if we need to pause here
  1263. result_data = self.getResultData()
  1264. pause = result_data.get('zuul', {}).get('pause')
  1265. if success and pause:
  1266. self.pause()
  1267. if self.aborted:
  1268. return 'ABORTED'
  1269. post_timeout = args['post_timeout']
  1270. unreachable = False
  1271. for index, playbook in enumerate(self.jobdir.post_playbooks):
  1272. # Post timeout operates a little differently to the main job
  1273. # timeout. We give each post playbook the full post timeout to
  1274. # do its job because post is where you'll often record job logs
  1275. # which are vital to understanding why timeouts have happened in
  1276. # the first place.
  1277. post_status, post_code = self.runAnsiblePlaybook(
  1278. playbook, post_timeout, self.ansible_version, success,
  1279. phase='post', index=index)
  1280. if post_status == self.RESULT_ABORTED:
  1281. return 'ABORTED'
  1282. if post_status == self.RESULT_UNREACHABLE:
  1283. # In case we encounter unreachable nodes we need to return None
  1284. # so the job can be retried. However in the case of post
  1285. # playbooks we should still try to run all playbooks to get a
  1286. # chance to upload logs.
  1287. unreachable = True
  1288. if post_status != self.RESULT_NORMAL or post_code != 0:
  1289. success = False
  1290. # If we encountered a pre-failure, that takes
  1291. # precedence over the post result.
  1292. if not pre_failed:
  1293. result = 'POST_FAILURE'
  1294. if (index + 1) == len(self.jobdir.post_playbooks):
  1295. self._logFinalPlaybookError()
  1296. if unreachable:
  1297. return None
  1298. return result
  1299. def runCleanupPlaybooks(self, success):
  1300. if not self.jobdir.cleanup_playbooks:
  1301. return
  1302. # TODO: make this configurable
  1303. cleanup_timeout = 300
  1304. with open(self.jobdir.job_output_file, 'a') as job_output:
  1305. job_output.write("{now} | Running Ansible cleanup...\n".format(
  1306. now=datetime.datetime.now()
  1307. ))
  1308. self.cleanup_started = True
  1309. for index, playbook in enumerate(self.jobdir.cleanup_playbooks):
  1310. self.runAnsiblePlaybook(
  1311. playbook, cleanup_timeout, self.ansible_version,
  1312. success=success, phase='cleanup', index=index)
  1313. def _logFinalPlaybookError(self):
  1314. # Failures in the final post playbook can include failures
  1315. # uploading logs, which makes diagnosing issues difficult.
  1316. # Grab the output from the last playbook from the json
  1317. # file and log it.
  1318. json_output = self.jobdir.job_output_file.replace('txt', 'json')
  1319. self.log.debug("Final playbook failed")
  1320. if not os.path.exists(json_output):
  1321. self.log.debug("JSON logfile {logfile} is missing".format(
  1322. logfile=json_output))
  1323. return
  1324. try:
  1325. output = json.load(open(json_output, 'r'))
  1326. last_playbook = output[-1]
  1327. # Transform json to yaml - because it's easier to read and given
  1328. # the size of the data it'll be extra-hard to read this as an
  1329. # all on one line stringified nested dict.
  1330. yaml_out = yaml.safe_dump(last_playbook, default_flow_style=False)
  1331. for line in yaml_out.split('\n'):
  1332. self.log.debug(line)
  1333. except Exception:
  1334. self.log.exception(
  1335. "Could not decode json from {logfile}".format(
  1336. logfile=json_output))
  1337. def getHostList(self, args):
  1338. hosts = []
  1339. for node in args['nodes']:
  1340. # NOTE(mordred): This assumes that the nodepool launcher
  1341. # and the zuul executor both have similar network
  1342. # characteristics, as the launcher will do a test for ipv6
  1343. # viability and if so, and if the node has an ipv6
  1344. # address, it will be the interface_ip. force-ipv4 can be
  1345. # set to True in the clouds.yaml for a cloud if this
  1346. # results in the wrong thing being in interface_ip
  1347. # TODO(jeblair): Move this notice to the docs.
  1348. for name in node['name']:
  1349. ip = node.get('interface_ip')
  1350. port = node.get('connection_port', node.get('ssh_port', 22))
  1351. host_vars = args['host_vars'].get(name, {}).copy()
  1352. check_varnames(host_vars)
  1353. host_vars.update(dict(
  1354. ansible_host=ip,
  1355. ansible_user=self.executor_server.default_username,
  1356. ansible_port=port,
  1357. nodepool=dict(
  1358. label=node.get('label'),
  1359. az=node.get('az'),
  1360. cloud=node.get('cloud'),
  1361. provider=node.get('provider'),
  1362. region=node.get('region'),
  1363. host_id=node.get('host_id'),
  1364. interface_ip=node.get('interface_ip'),
  1365. public_ipv4=node.get('public_ipv4'),
  1366. private_ipv4=node.get('private_ipv4'),
  1367. public_ipv6=node.get('public_ipv6'))))
  1368. # Ansible >=2.8 introduced "auto" as an
  1369. # ansible_python_interpreter argument that looks up
  1370. # which python to use on the remote host in an inbuilt
  1371. # table and essentially "does the right thing"
  1372. # (i.e. chooses python3 on 3-only hosts like later
  1373. # Fedoras).
  1374. # If ansible_python_interpreter is set either as a group
  1375. # var or all-var, then don't do anything here; let the
  1376. # user control.
  1377. api = 'ansible_python_interpreter'
  1378. if (api not in args['vars'] and
  1379. not is_group_var_set(api, name, args)):
  1380. python = node.get('python_path', 'auto')
  1381. host_vars.setdefault(api, python)
  1382. username = node.get('username')
  1383. if username:
  1384. host_vars['ansible_user'] = username
  1385. connection_type = node.get('connection_type')
  1386. if connection_type:
  1387. host_vars['ansible_connection'] = connection_type
  1388. if connection_type == "winrm":
  1389. host_vars['ansible_winrm_transport'] = 'certificate'
  1390. host_vars['ansible_winrm_cert_pem'] = \
  1391. self.winrm_pem_file
  1392. host_vars['ansible_winrm_cert_key_pem'] = \
  1393. self.winrm_key_file
  1394. # NOTE(tobiash): This is necessary when using default
  1395. # winrm self-signed certificates. This is probably what
  1396. # most installations want so hard code this here for
  1397. # now.
  1398. host_vars['ansible_winrm_server_cert_validation'] = \
  1399. 'ignore'
  1400. if self.winrm_operation_timeout is not None:
  1401. host_vars['ansible_winrm_operation_timeout_sec'] =\
  1402. self.winrm_operation_timeout
  1403. if self.winrm_read_timeout is not None:
  1404. host_vars['ansible_winrm_read_timeout_sec'] = \
  1405. self.winrm_read_timeout
  1406. elif connection_type == "kubectl":
  1407. host_vars['ansible_kubectl_context'] = \
  1408. node.get('kubectl_context')
  1409. host_keys = []
  1410. for key in node.get('host_keys', []):
  1411. if port != 22:
  1412. host_keys.append("[%s]:%s %s" % (ip, port, key))
  1413. else:
  1414. host_keys.append("%s %s" % (ip, key))
  1415. if not node.get('host_keys'):
  1416. host_vars['ansible_ssh_common_args'] = \
  1417. '-o StrictHostKeyChecking=false'
  1418. hosts.append(dict(
  1419. name=name,
  1420. host_vars=host_vars,
  1421. host_keys=host_keys))
  1422. return hosts
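# Illustrative sketch (not part of the upstream file): the kind of entry
# getHostList() appends to `hosts` for a single SSH node, mirroring the
# code above. The label, addresses and host key are assumed example values.
#
# example_host = {
#     'name': 'ubuntu-focal-0',
#     'host_vars': {
#         'ansible_host': '203.0.113.10',
#         'ansible_user': 'zuul',
#         'ansible_port': 22,
#         'ansible_python_interpreter': 'auto',
#         'nodepool': {'label': 'ubuntu-focal', 'az': None,
#                      'cloud': 'example-cloud', 'provider': 'example-provider',
#                      'region': 'RegionOne', 'host_id': None,
#                      'interface_ip': '203.0.113.10',
#                      'public_ipv4': '203.0.113.10', 'private_ipv4': None,
#                      'public_ipv6': None},
#     },
#     'host_keys': ['203.0.113.10 ssh-ed25519 AAAA...'],
# }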
  1423. def _blockPluginDirs(self, path):
  1424. '''Prevent execution of playbooks or roles with plugins
  1425. Plugins are loaded from roles and also if there is a plugin
  1426. dir adjacent to the playbook. Throw an error if the path
  1427. contains a location that would cause a plugin to get loaded.
  1428. '''
  1429. for entry in os.listdir(path):
  1430. entry = os.path.join(path, entry)
  1431. if os.path.isdir(entry) and entry.endswith('_plugins'):
  1432. raise PluginFoundError(
  1433. "Ansible plugin dir %s found adjacent to playbook %s in "
  1434. "non-trusted repo." % (entry, path))
  1435. def findPlaybook(self, path, trusted=False):
  1436. if os.path.exists(path):
  1437. if not trusted:
  1438. # Plugins can be defined in multiple locations within the
  1439. # playbook's subtree.
  1440. #
  1441. # 1. directly within the playbook:
  1442. # block playbook_dir/*_plugins
  1443. #
  1444. # 2. within a role defined in playbook_dir/<rolename>:
  1445. # block playbook_dir/*/*_plugins
  1446. #
  1447. # 3. within a role defined in playbook_dir/roles/<rolename>:
  1448. # block playbook_dir/roles/*/*_plugins
  1449. playbook_dir = os.path.dirname(os.path.abspath(path))
  1450. paths_to_check = []
  1451. def addPathsToCheck(root_dir):
  1452. if os.path.isdir(root_dir):
  1453. for entry in os.listdir(root_dir):
  1454. entry = os.path.join(root_dir, entry)
  1455. if os.path.isdir(entry):
  1456. paths_to_check.append(entry)
  1457. # handle case 1
  1458. paths_to_check.append(playbook_dir)
  1459. # handle case 2
  1460. addPathsToCheck(playbook_dir)
  1461. # handle case 3
  1462. addPathsToCheck(os.path.join(playbook_dir, 'roles'))
  1463. for path_to_check in paths_to_check:
  1464. self._blockPluginDirs(path_to_check)
  1465. return path
  1466. raise ExecutorError("Unable to find playbook %s" % path)
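# Illustrative sketch (not part of the upstream file): for an assumed
# untrusted playbook at .../org/project/playbooks/run.yaml, findPlaybook()
# ends up checking for plugin directories such as:
#
#   playbooks/*_plugins              (case 1: adjacent to the playbook)
#   playbooks/<dir>/*_plugins        (case 2: roles defined next to it)
#   playbooks/roles/<dir>/*_plugins  (case 3: roles under a roles/ dir)
#
# and raises PluginFoundError if any of them exists in an untrusted repo.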
  1467. def preparePlaybooks(self, args):
  1468. self.writeAnsibleConfig(self.jobdir.setup_playbook)
  1469. for playbook in args['pre_playbooks']:
  1470. jobdir_playbook = self.jobdir.addPrePlaybook()
  1471. self.preparePlaybook(jobdir_playbook, playbook, args)
  1472. job_playbook = None
  1473. for playbook in args['playbooks']:
  1474. jobdir_playbook = self.jobdir.addPlaybook()
  1475. self.preparePlaybook(jobdir_playbook, playbook, args)
  1476. if jobdir_playbook.path is not None:
  1477. if job_playbook is None:
  1478. job_playbook = jobdir_playbook
  1479. if job_playbook is None:
  1480. raise ExecutorError("No playbook specified")
  1481. for playbook in args['post_playbooks']:
  1482. jobdir_playbook = self.jobdir.addPostPlaybook()
  1483. self.preparePlaybook(jobdir_playbook, playbook, args)
  1484. for playbook in args['cleanup_playbooks']:
  1485. jobdir_playbook = self.jobdir.addCleanupPlaybook()
  1486. self.preparePlaybook(jobdir_playbook, playbook, args)
  1487. def preparePlaybook(self, jobdir_playbook, playbook, args):
  1488. # Check out the playbook repo if needed and set the path to
  1489. # the playbook that should be run.
  1490. self.log.debug("Prepare playbook repo for %s: %s@%s" %
  1491. (playbook['trusted'] and 'trusted' or 'untrusted',
  1492. playbook['project'], playbook['branch']))
  1493. source = self.executor_server.connections.getSource(
  1494. playbook['connection'])
  1495. project = source.getProject(playbook['project'])
  1496. branch = playbook['branch']
  1497. jobdir_playbook.trusted = playbook['trusted']
  1498. jobdir_playbook.branch = branch
  1499. jobdir_playbook.project_canonical_name = project.canonical_name
  1500. jobdir_playbook.canonical_name_and_path = os.path.join(
  1501. project.canonical_name, playbook['path'])
  1502. path = None
  1503. if not jobdir_playbook.trusted:
  1504. path = self.checkoutUntrustedProject(project, branch, args)
  1505. else:
  1506. path = self.checkoutTrustedProject(project, branch)
  1507. path = os.path.join(path, playbook['path'])
  1508. jobdir_playbook.path = self.findPlaybook(
  1509. path,
  1510. trusted=jobdir_playbook.trusted)
  1511. # If this playbook doesn't exist, don't bother preparing
  1512. # roles.
  1513. if not jobdir_playbook.path:
  1514. return
  1515. for role in playbook['roles']:
  1516. self.prepareRole(jobdir_playbook, role, args)
  1517. secrets = playbook['secrets']
  1518. if secrets:
  1519. check_varnames(secrets)
  1520. jobdir_playbook.secrets_content = yaml.safe_dump(
  1521. secrets, default_flow_style=False)
  1522. self.writeAnsibleConfig(jobdir_playbook)
  1523. def checkoutTrustedProject(self, project, branch):
  1524. root = self.jobdir.getTrustedProject(project.canonical_name,
  1525. branch)
  1526. if not root:
  1527. root = self.jobdir.addTrustedProject(project.canonical_name,
  1528. branch)
  1529. self.log.debug("Cloning %s@%s into new trusted space %s",
  1530. project, branch, root)
  1531. merger = self.executor_server._getMerger(
  1532. root,
  1533. self.executor_server.merge_root,
  1534. self.log)
  1535. merger.checkoutBranch(project.connection_name, project.name,
  1536. branch)
  1537. else:
  1538. self.log.debug("Using existing repo %s@%s in trusted space %s",
  1539. project, branch, root)
  1540. path = os.path.join(root,
  1541. project.canonical_hostname,
  1542. project.name)
  1543. return path
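# Illustrative sketch (not part of the upstream file): assuming a trusted
# root of /var/lib/zuul/builds/<uuid>/trusted/project_0 and a project whose
# canonical name is opendev.org/zuul/zuul, the path returned above would be
# /var/lib/zuul/builds/<uuid>/trusted/project_0/opendev.org/zuul/zuul.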
  1544. def checkoutUntrustedProject(self, project, branch, args):
  1545. root = self.jobdir.getUntrustedProject(project.canonical_name,
  1546. branch)
  1547. if not root:
  1548. root = self.jobdir.addUntrustedProject(project.canonical_name,
  1549. branch)
  1550. # If the project is in the dependency chain, clone from
  1551. # there so we pick up any speculative changes, otherwise,
  1552. # clone from the cache.
  1553. merger = None
  1554. for p in args['projects']:
  1555. if (p['connection'] == project.connection_name and
  1556. p['name'] == project.name):
  1557. # We already have this repo prepared
  1558. self.log.debug("Found workdir repo for untrusted project")
  1559. merger = self.executor_server._getMerger(
  1560. root,
  1561. self.jobdir.src_root,
  1562. self.log)
  1563. break
  1564. if merger is None:
  1565. merger = self.executor_server._getMerger(
  1566. root,
  1567. self.executor_server.merge_root,
  1568. self.log)
  1569. self.log.debug("Cloning %s@%s into new untrusted space %s",
  1570. project, branch, root)
  1571. merger.checkoutBranch(project.connection_name, project.name,
  1572. branch)
  1573. else:
1574. self.log.debug("Using existing repo %s@%s in untrusted space %s",
  1575. project, branch, root)
  1576. path = os.path.join(root,
  1577. project.canonical_hostname,
  1578. project.name)
  1579. return path
  1580. def prepareRole(self, jobdir_playbook, role, args):
  1581. if role['type'] == 'zuul':
  1582. root = jobdir_playbook.addRole()
  1583. self.prepareZuulRole(jobdir_playbook, role, args, root)
  1584. def findRole(self, path, trusted=False):
  1585. d = os.path.join(path, 'tasks')
  1586. if os.path.isdir(d):
  1587. # This is a bare role
  1588. if not trusted:
  1589. self._blockPluginDirs(path)
  1590. # None signifies that the repo is a bare role
  1591. return None
  1592. d = os.path.join(path, 'roles')
  1593. if os.path.isdir(d):
  1594. # This repo has a collection of roles
  1595. if not trusted:
  1596. self._blockPluginDirs(d)
  1597. for entry in os.listdir(d):
  1598. entry_path = os.path.join(d, entry)
  1599. if os.path.isdir(entry_path):
  1600. self._blockPluginDirs(entry_path)
  1601. return d
  1602. # It is neither a bare role, nor a collection of roles
  1603. raise RoleNotFoundError("Unable to find role in %s" % (path,))
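# Illustrative sketch (not part of the upstream file): the two repo layouts
# findRole() accepts; the file names below are assumed examples.
#
#   Bare role (returns None, the repo itself is the role):
#     <repo>/tasks/main.yaml
#     <repo>/defaults/main.yaml
#
#   Collection of roles (returns <repo>/roles):
#     <repo>/roles/foo/tasks/main.yaml
#     <repo>/roles/bar/tasks/main.yaml
#
# Anything else raises RoleNotFoundError.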
  1604. def prepareZuulRole(self, jobdir_playbook, role, args, root):
  1605. self.log.debug("Prepare zuul role for %s" % (role,))
  1606. # Check out the role repo if needed
  1607. source = self.executor_server.connections.getSource(
  1608. role['connection'])
  1609. project = source.getProject(role['project'])
  1610. name = role['target_name']
  1611. path = None
  1612. # Find the branch to use for this role. We should generally
  1613. # follow the normal fallback procedure, unless this role's
  1614. # project is the playbook's project, in which case we should
  1615. # use the playbook branch.
  1616. if jobdir_playbook.project_canonical_name == project.canonical_name:
  1617. branch = jobdir_playbook.branch
  1618. self.log.debug("Role project is playbook project, "
  1619. "using playbook branch %s", branch)
  1620. else:
  1621. # Find if the project is one of the job-specified projects.
  1622. # If it is, we can honor the project checkout-override options.
  1623. args_project = {}
  1624. for p in args['projects']:
  1625. if (p['canonical_name'] == project.canonical_name):
  1626. args_project = p
  1627. break
  1628. branch, selected_desc = self.resolveBranch(
  1629. project.canonical_name,
  1630. None,
  1631. args['branch'],
  1632. args['override_branch'],
  1633. args['override_checkout'],
  1634. args_project.get('override_branch'),
  1635. args_project.get('override_checkout'),
  1636. role['project_default_branch'])
  1637. self.log.debug("Role using %s %s", selected_desc, branch)
  1638. if not jobdir_playbook.trusted:
  1639. path = self.checkoutUntrustedProject(project, branch, args)
  1640. else:
  1641. path = self.checkoutTrustedProject(project, branch)
  1642. # The name of the symlink is the requested name of the role
  1643. # (which may be the repo name or may be something else; this
  1644. # can come into play if this is a bare role).
  1645. link = os.path.join(root, name)
  1646. link = os.path.realpath(link)
  1647. if not link.startswith(os.path.realpath(root)):
  1648. raise ExecutorError("Invalid role name %s" % name)
  1649. os.symlink(path, link)
  1650. try:
  1651. role_path = self.findRole(link, trusted=jobdir_playbook.trusted)
  1652. except RoleNotFoundError:
  1653. if role['implicit']:
  1654. self.log.debug("Implicit role not found in %s", link)
  1655. return
  1656. raise
  1657. except PluginFoundError:
  1658. if role['implicit']:
  1659. self.log.info("Not adding implicit role %s due to "
  1660. "plugin", link)
  1661. return
  1662. raise
  1663. if role_path is None:
  1664. # In the case of a bare role, add the containing directory
  1665. role_path = root
  1666. self.log.debug("Adding role path %s", role_path)
  1667. jobdir_playbook.roles_path.append(role_path)
  1668. def prepareKubeConfig(self, jobdir, data):
  1669. kube_cfg_path = jobdir.kubeconfig
  1670. if os.path.exists(kube_cfg_path):
  1671. kube_cfg = yaml.safe_load(open(kube_cfg_path))
  1672. else:
  1673. kube_cfg = {
  1674. 'apiVersion': 'v1',
  1675. 'kind': 'Config',
  1676. 'preferences': {},
  1677. 'users': [],
  1678. 'clusters': [],
  1679. 'contexts': [],
  1680. 'current-context': None,
  1681. }
  1682. # Add cluster
  1683. cluster_name = urlsplit(data['host']).netloc.replace('.', '-')
  1684. # Do not add a cluster/server that already exists in the kubeconfig
  1685. # because that leads to 'duplicate name' errors on multi-node builds.
  1686. # Also, as the cluster name directly corresponds to a server, there
  1687. # is no need to add it twice.
  1688. if cluster_name not in [c['name'] for c in kube_cfg['clusters']]:
  1689. cluster = {
  1690. 'server': data['host'],
  1691. }
  1692. if data.get('ca_crt'):
  1693. cluster['certificate-authority-data'] = data['ca_crt']
  1694. if data['skiptls']:
  1695. cluster['insecure-skip-tls-verify'] = True
  1696. kube_cfg['clusters'].append({
  1697. 'name': cluster_name,
  1698. 'cluster': cluster,
  1699. })
  1700. # Add user
  1701. user_name = "%s:%s" % (data['namespace'], data['user'])
  1702. kube_cfg['users'].append({
  1703. 'name': user_name,
  1704. 'user': {
  1705. 'token': data['token'],
  1706. },
  1707. })
  1708. # Add context
  1709. data['context_name'] = "%s/%s" % (user_name, cluster_name)
  1710. kube_cfg['contexts'].append({
  1711. 'name': data['context_name'],
  1712. 'context': {
  1713. 'user': user_name,
  1714. 'cluster': cluster_name,
  1715. 'namespace': data['namespace']
  1716. }
  1717. })
  1718. if not kube_cfg['current-context']:
  1719. kube_cfg['current-context'] = data['context_name']
  1720. with open(kube_cfg_path, "w") as of:
  1721. of.write(yaml.safe_dump(kube_cfg, default_flow_style=False))
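# Illustrative sketch (not part of the upstream file): a kubeconfig as
# written by prepareKubeConfig() for a single resource; host, namespace,
# user and token are assumed example values.
#
# {
#   'apiVersion': 'v1', 'kind': 'Config', 'preferences': {},
#   'clusters': [{'name': 'k8s-example-com:6443',
#                 'cluster': {'server': 'https://k8s.example.com:6443'}}],
#   'users': [{'name': 'zuul-ci:zuul-worker',
#              'user': {'token': '<token>'}}],
#   'contexts': [{'name': 'zuul-ci:zuul-worker/k8s-example-com:6443',
#                 'context': {'user': 'zuul-ci:zuul-worker',
#                             'cluster': 'k8s-example-com:6443',
#                             'namespace': 'zuul-ci'}}],
#   'current-context': 'zuul-ci:zuul-worker/k8s-example-com:6443',
# }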
  1722. def prepareAnsibleFiles(self, args):
  1723. all_vars = args['vars'].copy()
  1724. check_varnames(all_vars)
  1725. all_vars['zuul'] = args['zuul'].copy()
  1726. all_vars['zuul']['executor'] = dict(
  1727. hostname=self.executor_server.hostname,
  1728. src_root=self.jobdir.src_root,
  1729. log_root=self.jobdir.log_root,
  1730. work_root=self.jobdir.work_root,
  1731. result_data_file=self.jobdir.result_data_file,
  1732. inventory_file=self.jobdir.inventory)
  1733. resources_nodes = []
  1734. all_vars['zuul']['resources'] = {}
  1735. for node in args['nodes']:
  1736. if node.get('connection_type') in (
  1737. 'namespace', 'project', 'kubectl'):
  1738. # TODO: decrypt resource data using scheduler key
  1739. data = node['connection_port']
  1740. # Setup kube/config file
  1741. self.prepareKubeConfig(self.jobdir, data)
1742. # Convert connection_port into kubectl connection parameters
  1743. node['connection_port'] = None
  1744. node['kubectl_namespace'] = data['namespace']
  1745. node['kubectl_context'] = data['context_name']
  1746. # Add node information to zuul_resources
  1747. all_vars['zuul']['resources'][node['name'][0]] = {
  1748. 'namespace': data['namespace'],
  1749. 'context': data['context_name'],
  1750. }
  1751. if node['connection_type'] in ('project', 'namespace'):
1752. # Projects are special nodes that are not added to the inventory
  1753. resources_nodes.append(node)
  1754. else:
  1755. # Add the real pod name to the resources_var
  1756. all_vars['zuul']['resources'][
  1757. node['name'][0]]['pod'] = data['pod']
  1758. fwd = KubeFwd(zuul_event_id=self.zuul_event_id,
  1759. build=self.job.unique,
  1760. kubeconfig=self.jobdir.kubeconfig,
  1761. context=data['context_name'],
  1762. namespace=data['namespace'],
  1763. pod=data['pod'])
  1764. try:
  1765. fwd.start()
  1766. self.port_forwards.append(fwd)
  1767. all_vars['zuul']['resources'][
  1768. node['name'][0]]['stream_port'] = fwd.port
  1769. except Exception:
  1770. self.log.exception("Unable to start port forward:")
  1771. self.log.error("Kubectl and socat are required for "
  1772. "streaming logs")
  1773. # Remove resource node from nodes list
  1774. for node in resources_nodes:
  1775. args['nodes'].remove(node)
  1776. nodes = self.getHostList(args)
  1777. setup_inventory = make_setup_inventory_dict(nodes)
  1778. inventory = make_inventory_dict(nodes, args, all_vars)
  1779. with open(self.jobdir.setup_inventory, 'w') as setup_inventory_yaml:
  1780. setup_inventory_yaml.write(
  1781. yaml.safe_dump(setup_inventory, default_flow_style=False))
  1782. with open(self.jobdir.inventory, 'w') as inventory_yaml:
  1783. inventory_yaml.write(
  1784. yaml.safe_dump(inventory, default_flow_style=False))
  1785. with open(self.jobdir.known_hosts, 'w') as known_hosts:
  1786. for node in nodes:
  1787. for key in node['host_keys']:
  1788. known_hosts.write('%s\n' % key)
  1789. with open(self.jobdir.extra_vars, 'w') as extra_vars:
  1790. extra_vars.write(
  1791. yaml.safe_dump(args['extra_vars'], default_flow_style=False))
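# Illustrative summary (not part of the upstream file): prepareAnsibleFiles()
# leaves four files in the jobdir, referenced by the attributes used above:
#
#   jobdir.setup_inventory - hosts only, used for the fact-gathering setup run
#   jobdir.inventory       - hosts plus the merged zuul/job variables
#   jobdir.known_hosts     - one host key per line ("ip key" or "[ip]:port key")
#   jobdir.extra_vars      - the job's extra_vars, later passed with -e@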
  1792. def writeLoggingConfig(self):
  1793. self.log.debug("Writing logging config for job %s %s",
  1794. self.jobdir.job_output_file,
  1795. self.jobdir.logging_json)
  1796. logging_config = zuul.ansible.logconfig.JobLoggingConfig(
  1797. job_output_file=self.jobdir.job_output_file)
  1798. logging_config.writeJson(self.jobdir.logging_json)
  1799. def writeAnsibleConfig(self, jobdir_playbook):
  1800. trusted = jobdir_playbook.trusted
  1801. # TODO(mordred) This should likely be extracted into a more generalized
  1802. # mechanism for deployers being able to add callback
  1803. # plugins.
  1804. if self.ara_callbacks:
  1805. callback_path = '%s:%s' % (
  1806. self.callback_dir,
  1807. os.path.dirname(self.ara_callbacks))
  1808. else:
  1809. callback_path = self.callback_dir
  1810. with open(jobdir_playbook.ansible_config, 'w') as config:
  1811. config.write('[defaults]\n')
  1812. config.write('inventory = %s\n' % self.jobdir.inventory)
  1813. config.write('local_tmp = %s\n' % self.jobdir.local_tmp)
  1814. config.write('retry_files_enabled = False\n')
  1815. config.write('gathering = smart\n')
  1816. config.write('fact_caching = jsonfile\n')
  1817. config.write('fact_caching_connection = %s\n' %
  1818. self.jobdir.fact_cache)
  1819. config.write('library = %s\n'
  1820. % self.library_dir)
  1821. config.write('command_warnings = False\n')
  1822. config.write('callback_plugins = %s\n' % callback_path)
  1823. config.write('stdout_callback = zuul_stream\n')
  1824. config.write('filter_plugins = %s\n'
  1825. % self.filter_dir)
1826. config.write('nocows = True\n') # avoid useless stat() calls
  1827. # bump the timeout because busy nodes may take more than
  1828. # 10s to respond
  1829. config.write('timeout = 30\n')
  1830. # We need the general action dir to make the zuul_return plugin
  1831. # available to every job.
  1832. action_dirs = [self.action_dir_general]
  1833. if not trusted:
  1834. # Untrusted jobs add the action dir which makes sure localhost
  1835. # modules are restricted where needed. Further the command
  1836. # plugin needs to be restricted and also inject zuul_log_id
  1837. # to make log streaming work.
  1838. action_dirs.append(self.action_dir)
  1839. config.write('lookup_plugins = %s\n'
  1840. % self.lookup_dir)
  1841. else:
  1842. # Trusted jobs add the actiontrusted dir which adds the
  1843. # unrestricted command plugin to inject zuul_log_id to make
  1844. # log streaming work.
  1845. action_dirs.append(self.action_dir_trusted)
  1846. config.write('action_plugins = %s\n'
  1847. % ':'.join(action_dirs))
  1848. if jobdir_playbook.roles_path:
  1849. config.write('roles_path = %s\n' % ':'.join(
  1850. jobdir_playbook.roles_path))
  1851. # On playbooks with secrets we want to prevent the
  1852. # printing of args since they may be passed to a task or a
  1853. # role. Otherwise, printing the args could be useful for
  1854. # debugging.
  1855. config.write('display_args_to_stdout = %s\n' %
  1856. str(not jobdir_playbook.secrets_content))
  1857. # Increase the internal poll interval of ansible.
  1858. # The default interval of 0.001s is optimized for interactive
  1859. # ui at the expense of CPU load. As we have a non-interactive
  1860. # automation use case a longer poll interval is more suitable
  1861. # and reduces CPU load of the ansible process.
  1862. config.write('internal_poll_interval = 0.01\n')
  1863. if self.ansible_callbacks:
  1864. config.write('callback_whitelist =\n')
  1865. for callback in self.ansible_callbacks.keys():
  1866. config.write(' %s,\n' % callback)
  1867. config.write('[ssh_connection]\n')
  1868. # NOTE(pabelanger): Try up to 3 times to run a task on a host, this
  1869. # helps to mitigate UNREACHABLE host errors with SSH.
  1870. config.write('retries = 3\n')
  1871. # NB: when setting pipelining = True, keep_remote_files
  1872. # must be False (the default). Otherwise it apparently
  1873. # will override the pipelining option and effectively
  1874. # disable it. Pipelining has a side effect of running the
  1875. # command without a tty (ie, without the -tt argument to
  1876. # ssh). We require this behavior so that if a job runs a
  1877. # command which expects interactive input on a tty (such
  1878. # as sudo) it does not hang.
  1879. config.write('pipelining = True\n')
  1880. config.write('control_path_dir = %s\n' % self.jobdir.control_path)
  1881. ssh_args = "-o ControlMaster=auto -o ControlPersist=60s " \
  1882. "-o ServerAliveInterval=60 " \
  1883. "-o UserKnownHostsFile=%s" % self.jobdir.known_hosts
  1884. config.write('ssh_args = %s\n' % ssh_args)
  1885. if self.ansible_callbacks:
  1886. for cb_name, cb_config in self.ansible_callbacks.items():
  1887. config.write("[callback_%s]\n" % cb_name)
  1888. for k, n in cb_config.items():
  1889. config.write("%s = %s\n" % (k, n))
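# Illustrative sketch (not part of the upstream file): an abbreviated
# ansible.cfg as written above for an untrusted playbook; the paths are
# assumed example values and several keys are omitted for brevity.
#
# [defaults]
# inventory = /var/lib/zuul/builds/<uuid>/ansible/inventory.yaml
# retry_files_enabled = False
# gathering = smart
# fact_caching = jsonfile
# command_warnings = False
# stdout_callback = zuul_stream
# nocows = True
# timeout = 30
# internal_poll_interval = 0.01
#
# [ssh_connection]
# retries = 3
# pipelining = True
# ssh_args = -o ControlMaster=auto -o ControlPersist=60s
#            -o ServerAliveInterval=60 -o UserKnownHostsFile=<jobdir>/known_hosts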
  1890. def _ansibleTimeout(self, msg):
  1891. self.log.warning(msg)
  1892. self.abortRunningProc()
  1893. def abortRunningProc(self):
  1894. with self.proc_lock:
  1895. if self.proc and not self.cleanup_started:
  1896. self.log.debug("Abort: sending kill signal to job "
  1897. "process group")
  1898. try:
  1899. pgid = os.getpgid(self.proc.pid)
  1900. os.killpg(pgid, signal.SIGKILL)
  1901. except Exception:
  1902. self.log.exception(
  1903. "Exception while killing ansible process:")
  1904. elif self.proc and self.cleanup_started:
  1905. self.log.debug("Abort: cleanup is in progress")
  1906. else:
  1907. self.log.debug("Abort: no process is running")
  1908. def runAnsible(self, cmd, timeout, playbook, ansible_version,
  1909. wrapped=True, cleanup=False):
  1910. config_file = playbook.ansible_config
  1911. env_copy = {key: value
  1912. for key, value in os.environ.copy().items()
  1913. if not key.startswith("ZUUL_")}
  1914. env_copy.update(self.ssh_agent.env)
  1915. if self.ara_callbacks:
  1916. env_copy['ARA_LOG_CONFIG'] = self.jobdir.logging_json
  1917. env_copy['ZUUL_JOB_LOG_CONFIG'] = self.jobdir.logging_json
  1918. env_copy['ZUUL_JOBDIR'] = self.jobdir.root
  1919. if self.executor_server.log_console_port != DEFAULT_STREAM_PORT:
  1920. env_copy['ZUUL_CONSOLE_PORT'] = str(
  1921. self.executor_server.log_console_port)
  1922. env_copy['TMP'] = self.jobdir.local_tmp
  1923. pythonpath = env_copy.get('PYTHONPATH')
  1924. if pythonpath:
  1925. pythonpath = [pythonpath]
  1926. else:
  1927. pythonpath = []
  1928. ansible_dir = self.executor_server.ansible_manager.getAnsibleDir(
  1929. ansible_version)
  1930. pythonpath = [ansible_dir] + pythonpath
  1931. env_copy['PYTHONPATH'] = os.path.pathsep.join(pythonpath)
  1932. if playbook.trusted:
  1933. opt_prefix = 'trusted'
  1934. else:
  1935. opt_prefix = 'untrusted'
  1936. ro_paths = get_default(self.executor_server.config, 'executor',
  1937. '%s_ro_paths' % opt_prefix)
  1938. rw_paths = get_default(self.executor_server.config, 'executor',
  1939. '%s_rw_paths' % opt_prefix)
  1940. ro_paths = ro_paths.split(":") if ro_paths else []
  1941. rw_paths = rw_paths.split(":") if rw_paths else []
  1942. ro_paths.append(ansible_dir)
  1943. ro_paths.append(
  1944. self.executor_server.ansible_manager.getAnsibleInstallDir(
  1945. ansible_version))
  1946. ro_paths.append(self.jobdir.ansible_root)
  1947. ro_paths.append(self.jobdir.trusted_root)
  1948. ro_paths.append(self.jobdir.untrusted_root)
  1949. ro_paths.append(playbook.root)
  1950. rw_paths.append(self.jobdir.ansible_cache_root)
  1951. if self.executor_variables_file:
  1952. ro_paths.append(self.executor_variables_file)
  1953. secrets = {}
  1954. if playbook.secrets_content:
  1955. secrets[playbook.secrets] = playbook.secrets_content
  1956. if wrapped:
  1957. wrapper = self.executor_server.execution_wrapper
  1958. else:
  1959. wrapper = self.executor_server.connections.drivers['nullwrap']
  1960. context = wrapper.getExecutionContext(ro_paths, rw_paths, secrets)
  1961. popen = context.getPopen(
  1962. work_dir=self.jobdir.work_root,
  1963. ssh_auth_sock=env_copy.get('SSH_AUTH_SOCK'))
  1964. env_copy['ANSIBLE_CONFIG'] = config_file
  1965. # NOTE(pabelanger): Default HOME variable to jobdir.work_root, as it is
  1966. # possible we don't bind mount current zuul user home directory.
  1967. env_copy['HOME'] = self.jobdir.work_root
  1968. with self.proc_lock:
  1969. if self.aborted and not cleanup:
  1970. return (self.RESULT_ABORTED, None)
  1971. self.log.debug("Ansible command: ANSIBLE_CONFIG=%s ZUUL_JOBDIR=%s "
  1972. "ZUUL_JOB_LOG_CONFIG=%s PYTHONPATH=%s TMP=%s %s",
  1973. env_copy['ANSIBLE_CONFIG'],
  1974. env_copy['ZUUL_JOBDIR'],
  1975. env_copy['ZUUL_JOB_LOG_CONFIG'],
  1976. env_copy['PYTHONPATH'],
  1977. env_copy['TMP'],
  1978. " ".join(shlex.quote(c) for c in cmd))
  1979. self.proc = popen(
  1980. cmd,
  1981. cwd=self.jobdir.work_root,
  1982. stdin=subprocess.DEVNULL,
  1983. stdout=subprocess.PIPE,
  1984. stderr=subprocess.STDOUT,
  1985. start_new_session=True,
  1986. env=env_copy,
  1987. )
  1988. syntax_buffer = []
  1989. ret = None
  1990. if timeout:
  1991. watchdog = Watchdog(timeout, self._ansibleTimeout,
  1992. ("Ansible timeout exceeded: %s" % timeout,))
  1993. watchdog.start()
  1994. try:
  1995. ansible_log = get_annotated_logger(
  1996. logging.getLogger("zuul.AnsibleJob.output"),
  1997. self.zuul_event_id, build=self.job.unique)
  1998. # Use manual idx instead of enumerate so that RESULT lines
  1999. # don't count towards BUFFER_LINES_FOR_SYNTAX
  2000. idx = 0
  2001. for line in iter(self.proc.stdout.readline, b''):
  2002. if line.startswith(b'RESULT'):
  2003. # TODO(mordred) Process result commands if sent
  2004. continue
  2005. else:
  2006. idx += 1
  2007. if idx < BUFFER_LINES_FOR_SYNTAX:
  2008. syntax_buffer.append(line)
  2009. line = line[:1024].rstrip()
  2010. ansible_log.debug("Ansible output: %s" % (line,))
  2011. self.log.debug("Ansible output terminated")
  2012. try:
  2013. cpu_times = self.proc.cpu_times()
  2014. self.log.debug("Ansible cpu times: user=%.2f, system=%.2f, "
  2015. "children_user=%.2f, "
  2016. "children_system=%.2f" %
  2017. (cpu_times.user, cpu_times.system,
  2018. cpu_times.children_user,
  2019. cpu_times.children_system))
  2020. self.cpu_times['user'] += cpu_times.user
  2021. self.cpu_times['system'] += cpu_times.system
  2022. self.cpu_times['children_user'] += cpu_times.children_user
  2023. self.cpu_times['children_system'] += cpu_times.children_system
  2024. except psutil.NoSuchProcess:
2025. self.log.warn("Cannot get cpu_times for process %d. Is your "
  2026. "/proc mounted with hidepid=2"
  2027. " on an old linux kernel?", self.proc.pid)
  2028. ret = self.proc.wait()
  2029. self.log.debug("Ansible exit code: %s" % (ret,))
  2030. finally:
  2031. if timeout:
  2032. watchdog.stop()
  2033. self.log.debug("Stopped watchdog")
  2034. self.log.debug("Stopped disk job killer")
  2035. with self.proc_lock:
  2036. self.proc.stdout.close()
  2037. self.proc = None
  2038. if timeout and watchdog.timed_out:
  2039. return (self.RESULT_TIMED_OUT, None)
2040. # Note: Unlike what is documented, ansible currently wrongly returns 4
2041. # on unreachable hosts, so we have the zuul_unreachable callback module
2042. # that creates the file job-output.unreachable in case there were
2043. # unreachable nodes. This can be removed once ansible returns a
2044. # distinct value for unreachable.
  2045. if ret == 3 or os.path.exists(self.jobdir.job_unreachable_file):
  2046. # AnsibleHostUnreachable: We had a network issue connecting to
  2047. # our zuul-worker.
  2048. return (self.RESULT_UNREACHABLE, None)
  2049. elif ret == -9:
  2050. # Received abort request.
  2051. return (self.RESULT_ABORTED, None)
  2052. elif ret == 1:
  2053. with open(self.jobdir.job_output_file, 'a') as job_output:
  2054. found_marker = False
  2055. for line in syntax_buffer:
  2056. if line.startswith(b'ERROR!'):
  2057. found_marker = True
  2058. if not found_marker:
  2059. continue
  2060. job_output.write("{now} | {line}\n".format(
  2061. now=datetime.datetime.now(),
  2062. line=line.decode('utf-8').rstrip()))
  2063. elif ret == 4:
  2064. # Ansible could not parse the yaml.
  2065. self.log.debug("Ansible parse error")
  2066. # TODO(mordred) If/when we rework use of logger in ansible-playbook
  2067. # we'll want to change how this works to use that as well. For now,
  2068. # this is what we need to do.
  2069. # TODO(mordred) We probably want to put this into the json output
  2070. # as well.
  2071. with open(self.jobdir.job_output_file, 'a') as job_output:
  2072. job_output.write("{now} | ANSIBLE PARSE ERROR\n".format(
  2073. now=datetime.datetime.now()))
  2074. for line in syntax_buffer:
  2075. job_output.write("{now} | {line}\n".format(
  2076. now=datetime.datetime.now(),
  2077. line=line.decode('utf-8').rstrip()))
  2078. elif ret == 250:
  2079. # Unexpected error from ansible
  2080. with open(self.jobdir.job_output_file, 'a') as job_output:
  2081. job_output.write("{now} | UNEXPECTED ANSIBLE ERROR\n".format(
  2082. now=datetime.datetime.now()))
  2083. found_marker = False
  2084. for line in syntax_buffer:
  2085. if line.startswith(b'ERROR! Unexpected Exception'):
  2086. found_marker = True
  2087. if not found_marker:
  2088. continue
  2089. job_output.write("{now} | {line}\n".format(
  2090. now=datetime.datetime.now(),
  2091. line=line.decode('utf-8').rstrip()))
  2092. elif ret == 2:
  2093. with open(self.jobdir.job_output_file, 'a') as job_output:
  2094. found_marker = False
  2095. for line in syntax_buffer:
  2096. # This is a workaround to detect winrm connection failures
  2097. # that are not detected by ansible. These can be detected
  2098. # if the string 'FATAL ERROR DURING FILE TRANSFER' is in
  2099. # the ansible output. In this case we should treat the
  2100. # host as unreachable and retry the job.
  2101. if b'FATAL ERROR DURING FILE TRANSFER' in line:
  2102. return self.RESULT_UNREACHABLE, None
  2103. # Extract errors for special cases that are treated like
  2104. # task errors by Ansible (e.g. missing role when using
  2105. # 'include_role').
  2106. if line.startswith(b'ERROR!'):
  2107. found_marker = True
  2108. if not found_marker:
  2109. continue
  2110. job_output.write("{now} | {line}\n".format(
  2111. now=datetime.datetime.now(),
  2112. line=line.decode('utf-8').rstrip()))
  2113. if self.aborted:
  2114. return (self.RESULT_ABORTED, None)
  2115. return (self.RESULT_NORMAL, ret)
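# Illustrative summary (not part of the upstream file) of the exit-code
# handling above:
#
#   watchdog timed out                        -> RESULT_TIMED_OUT
#   3, or job-output.unreachable exists       -> RESULT_UNREACHABLE
#   -9 (killed by an abort request)           -> RESULT_ABORTED
#   2 with 'FATAL ERROR DURING FILE TRANSFER' -> RESULT_UNREACHABLE (winrm case)
#   1, 2, 4, 250                              -> error text copied to the job
#                                                output, then RESULT_NORMAL
#   anything else                             -> RESULT_NORMAL with the code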
  2116. def runAnsibleSetup(self, playbook, ansible_version):
  2117. if self.executor_server.verbose:
  2118. verbose = '-vvv'
  2119. else:
  2120. verbose = '-v'
  2121. # TODO: select correct ansible version from job
  2122. ansible = self.executor_server.ansible_manager.getAnsibleCommand(
  2123. ansible_version,
  2124. command='ansible')
  2125. cmd = [ansible, '*', verbose, '-m', 'setup',
  2126. '-i', self.jobdir.setup_inventory,
  2127. '-a', 'gather_subset=!all']
  2128. if self.executor_variables_file is not None:
  2129. cmd.extend(['-e@%s' % self.executor_variables_file])
  2130. result, code = self.runAnsible(
  2131. cmd=cmd, timeout=self.executor_server.setup_timeout,
  2132. playbook=playbook, ansible_version=ansible_version, wrapped=False)
  2133. self.log.debug("Ansible complete, result %s code %s" % (
  2134. self.RESULT_MAP[result], code))
  2135. if self.executor_server.statsd:
  2136. base_key = "zuul.executor.{hostname}.phase.setup"
  2137. self.executor_server.statsd.incr(base_key + ".%s" %
  2138. self.RESULT_MAP[result])
  2139. return result, code
  2140. def runAnsibleCleanup(self, playbook):
  2141. # TODO(jeblair): This requires a bugfix in Ansible 2.4
  2142. # Once this is used, increase the controlpersist timeout.
  2143. return (self.RESULT_NORMAL, 0)
  2144. if self.executor_server.verbose:
  2145. verbose = '-vvv'
  2146. else:
  2147. verbose = '-v'
  2148. cmd = ['ansible', '*', verbose, '-m', 'meta',
  2149. '-a', 'reset_connection']
  2150. result, code = self.runAnsible(
  2151. cmd=cmd, timeout=60, playbook=playbook,
  2152. wrapped=False)
  2153. self.log.debug("Ansible complete, result %s code %s" % (
  2154. self.RESULT_MAP[result], code))
  2155. if self.executor_server.statsd:
  2156. base_key = "zuul.executor.{hostname}.phase.cleanup"
  2157. self.executor_server.statsd.incr(base_key + ".%s" %
  2158. self.RESULT_MAP[result])
  2159. return result, code
  2160. def emitPlaybookBanner(self, playbook, step, phase, result=None):
  2161. # This is used to print a header and a footer, respectively at the
  2162. # beginning and the end of each playbook execution.
  2163. # We are doing it from the executor rather than from a callback because
  2164. # the parameters are not made available to the callback until it's too
  2165. # late.
  2166. phase = phase or ''
  2167. trusted = playbook.trusted
  2168. trusted = 'trusted' if trusted else 'untrusted'
  2169. branch = playbook.branch
  2170. playbook = playbook.canonical_name_and_path
  2171. if phase and phase != 'run':
  2172. phase = '{phase}-run'.format(phase=phase)
  2173. phase = phase.upper()
  2174. if result is not None:
  2175. result = self.RESULT_MAP[result]
  2176. msg = "{phase} {step} {result}: [{trusted} : {playbook}@{branch}]"
  2177. msg = msg.format(phase=phase, step=step, result=result,
  2178. trusted=trusted, playbook=playbook, branch=branch)
  2179. else:
  2180. msg = "{phase} {step}: [{trusted} : {playbook}@{branch}]"
  2181. msg = msg.format(phase=phase, step=step, trusted=trusted,
  2182. playbook=playbook, branch=branch)
  2183. with open(self.jobdir.job_output_file, 'a') as job_output:
  2184. job_output.write("{now} | {msg}\n".format(
  2185. now=datetime.datetime.now(),
  2186. msg=msg))
  2187. def runAnsiblePlaybook(self, playbook, timeout, ansible_version,
  2188. success=None, phase=None, index=None):
  2189. if self.executor_server.verbose:
  2190. verbose = '-vvv'
  2191. else:
  2192. verbose = '-v'
  2193. cmd = [self.executor_server.ansible_manager.getAnsibleCommand(
  2194. ansible_version), verbose, playbook.path]
  2195. if playbook.secrets_content:
  2196. cmd.extend(['-e', '@' + playbook.secrets])
  2197. cmd.extend(['-e', '@' + self.jobdir.extra_vars])
  2198. if success is not None:
  2199. cmd.extend(['-e', 'zuul_success=%s' % str(bool(success))])
  2200. if phase:
  2201. cmd.extend(['-e', 'zuul_execution_phase=%s' % phase])
  2202. if index is not None:
  2203. cmd.extend(['-e', 'zuul_execution_phase_index=%s' % index])
  2204. cmd.extend(['-e', 'zuul_execution_trusted=%s' % str(playbook.trusted)])
  2205. cmd.extend([
  2206. '-e',
  2207. 'zuul_execution_canonical_name_and_path=%s'
  2208. % playbook.canonical_name_and_path])
  2209. cmd.extend(['-e', 'zuul_execution_branch=%s' % str(playbook.branch)])
  2210. if self.executor_variables_file is not None:
  2211. cmd.extend(['-e@%s' % self.executor_variables_file])
  2212. if not playbook.trusted:
  2213. cmd.extend(['-e', '@' + self.jobdir.ansible_vars_blacklist])
  2214. self.emitPlaybookBanner(playbook, 'START', phase)
  2215. result, code = self.runAnsible(cmd, timeout, playbook, ansible_version,
  2216. cleanup=phase == 'cleanup')
  2217. self.log.debug("Ansible complete, result %s code %s" % (
  2218. self.RESULT_MAP[result], code))
  2219. if self.executor_server.statsd:
  2220. base_key = "zuul.executor.{hostname}.phase.{phase}"
  2221. self.executor_server.statsd.incr(
  2222. base_key + ".{result}",
  2223. result=self.RESULT_MAP[result],
  2224. phase=phase or 'unknown')
  2225. self.emitPlaybookBanner(playbook, 'END', phase, result=result)
  2226. return result, code
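# Illustrative sketch (not part of the upstream file): roughly the command
# runAnsiblePlaybook() assembles for an untrusted post-run playbook; the
# paths, file names, branch and verbosity are assumed example values.
#
# ansible-playbook -v <jobdir>/untrusted/project_0/.../playbooks/post.yaml \
#     -e @<jobdir>/ansible/extra_vars.yaml \
#     -e zuul_success=True \
#     -e zuul_execution_phase=post \
#     -e zuul_execution_phase_index=0 \
#     -e zuul_execution_trusted=False \
#     -e zuul_execution_canonical_name_and_path=example.org/org/project/playbooks/post.yaml \
#     -e zuul_execution_branch=master \
#     -e @<jobdir>/ansible/vars_blacklist.yaml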
  2227. class ExecutorMergeWorker(gear.TextWorker):
  2228. def __init__(self, executor_server, *args, **kw):
  2229. self.zuul_executor_server = executor_server
  2230. super(ExecutorMergeWorker, self).__init__(*args, **kw)
  2231. def handleNoop(self, packet):
  2232. # Wait until the update queue is empty before responding
  2233. while self.zuul_executor_server.update_queue.qsize():
  2234. time.sleep(1)
  2235. super(ExecutorMergeWorker, self).handleNoop(packet)
  2236. class ExecutorExecuteWorker(gear.TextWorker):
  2237. def __init__(self, executor_server, *args, **kw):
  2238. self.zuul_executor_server = executor_server
  2239. super(ExecutorExecuteWorker, self).__init__(*args, **kw)
  2240. def handleNoop(self, packet):
  2241. # Delay our response to running a new job based on the number
  2242. # of jobs we're currently running, in an attempt to spread
  2243. # load evenly among executors.
  2244. workers = len(self.zuul_executor_server.job_workers)
  2245. delay = (workers ** 2) / 1000.0
  2246. time.sleep(delay)
  2247. return super(ExecutorExecuteWorker, self).handleNoop(packet)
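# Illustrative numbers (not part of the upstream file) for the quadratic
# delay above: 0 running jobs -> 0s, 10 -> 0.1s, 40 -> 1.6s, 100 -> 10s,
# so busier executors answer noops later and therefore pick up fewer new jobs.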
  2248. class ExecutorServer(BaseMergeServer):
  2249. log = logging.getLogger("zuul.ExecutorServer")
  2250. _ansible_manager_class = AnsibleManager
  2251. _job_class = AnsibleJob
  2252. _repo_locks_class = RepoLocks
  2253. def __init__(self, config, connections=None, jobdir_root=None,
  2254. keep_jobdir=False, log_streaming_port=DEFAULT_FINGER_PORT,
  2255. log_console_port=DEFAULT_STREAM_PORT):
  2256. super().__init__(config, 'executor', connections)
  2257. self.keep_jobdir = keep_jobdir
  2258. self.jobdir_root = jobdir_root
  2259. # TODOv3(mordred): make the executor name more unique --
  2260. # perhaps hostname+pid.
  2261. self.hostname = get_default(self.config, 'executor', 'hostname',
  2262. socket.getfqdn())
  2263. self.log_streaming_port = log_streaming_port
  2264. self.governor_lock = threading.Lock()
  2265. self.run_lock = threading.Lock()
  2266. self.verbose = False
  2267. self.command_map = dict(
  2268. stop=self.stop,
  2269. pause=self.pause,
  2270. unpause=self.unpause,
  2271. graceful=self.graceful,
  2272. verbose=self.verboseOn,
  2273. unverbose=self.verboseOff,
  2274. keep=self.keep,
  2275. nokeep=self.nokeep,
  2276. repl=self.start_repl,
  2277. norepl=self.stop_repl,
  2278. )
  2279. self.log_console_port = log_console_port
  2280. self.repl = None
  2281. statsd_extra_keys = {'hostname': self.hostname}
  2282. self.statsd = get_statsd(config, statsd_extra_keys)
  2283. self.default_username = get_default(self.config, 'executor',
  2284. 'default_username', 'zuul')
  2285. self.disk_limit_per_job = int(get_default(self.config, 'executor',
  2286. 'disk_limit_per_job', 250))
  2287. self.setup_timeout = int(get_default(self.config, 'executor',
  2288. 'ansible_setup_timeout', 60))
  2289. self.zone = get_default(self.config, 'executor', 'zone')
  2290. self.ansible_callbacks = {}
  2291. for section_name in self.config.sections():
  2292. cb_match = re.match(r'^ansible_callback ([\'\"]?)(.*)(\1)$',
  2293. section_name, re.I)
  2294. if not cb_match:
  2295. continue
  2296. cb_name = cb_match.group(2)
  2297. self.ansible_callbacks[cb_name] = dict(
  2298. self.config.items(section_name)
  2299. )
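# Illustrative sketch (not part of the upstream file): a zuul.conf section
# matched by the regex above; the callback name and option are assumed
# example values.
#
# [ansible_callback "timer"]
# output_limit = 20
#
# would yield self.ansible_callbacks == {'timer': {'output_limit': '20'}}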
  2300. # TODO(tobiash): Take cgroups into account
  2301. self.update_workers = multiprocessing.cpu_count()
  2302. self.update_threads = []
  2303. # If the execution driver ever becomes configurable again,
  2304. # this is where it would happen.
  2305. execution_wrapper_name = 'bubblewrap'
  2306. self.accepting_work = False
  2307. self.execution_wrapper = connections.drivers[execution_wrapper_name]
  2308. self.update_queue = DeduplicateQueue()
  2309. command_socket = get_default(
  2310. self.config, 'executor', 'command_socket',
  2311. '/var/lib/zuul/executor.socket')
  2312. self.command_socket = commandsocket.CommandSocket(command_socket)
  2313. state_dir = get_default(self.config, 'executor', 'state_dir',
  2314. '/var/lib/zuul', expand_user=True)
  2315. # If keep is not set, ensure the job dir is empty on startup,
  2316. # in case we were uncleanly shut down.
  2317. if not self.keep_jobdir:
  2318. for fn in os.listdir(self.jobdir_root):
  2319. fn = os.path.join(self.jobdir_root, fn)
  2320. if not os.path.isdir(fn):
  2321. continue
  2322. self.log.info("Deleting stale jobdir %s", fn)
  2323. # We use rm here instead of shutil because of
  2324. # https://bugs.python.org/issue22040
  2325. jobdir = os.path.join(self.jobdir_root, fn)
  2326. # First we need to ensure all directories are
  2327. # writable to avoid permission denied error
  2328. subprocess.Popen([
  2329. "find", jobdir,
  2330. # Filter non writable perms
  2331. "-type", "d", "!", "-perm", "/u+w",
  2332. # Replace by writable perms
  2333. "-exec", "chmod", "0700", "{}", "+"]).wait()
  2334. if subprocess.Popen(["rm", "-Rf", jobdir]).wait():
  2335. raise RuntimeError("Couldn't delete: " + jobdir)
  2336. self.job_workers = {}
  2337. self.disk_accountant = DiskAccountant(self.jobdir_root,
  2338. self.disk_limit_per_job,
  2339. self.stopJobDiskFull,
  2340. self.merge_root)
  2341. self.pause_sensor = PauseSensor(get_default(self.config, 'executor',
  2342. 'paused_on_start', False))
  2343. self.log.info("Starting executor (hostname: %s) in %spaused mode" % (
  2344. self.hostname, "" if self.pause_sensor.pause else "un"))
  2345. cpu_sensor = CPUSensor(config)
  2346. self.sensors = [
  2347. cpu_sensor,
  2348. HDDSensor(config),
  2349. self.pause_sensor,
  2350. RAMSensor(config),
  2351. StartingBuildsSensor(self, cpu_sensor.max_load_avg, config)
  2352. ]
  2353. manage_ansible = get_default(
  2354. self.config, 'executor', 'manage_ansible', True)
  2355. ansible_dir = os.path.join(state_dir, 'ansible')
  2356. ansible_install_root = get_default(
  2357. self.config, 'executor', 'ansible_root', None)
  2358. if not ansible_install_root:
2359. # NOTE: Even though we set this value, the virtualenv location
2360. # adjacent to the zuul installation is still checked by the ansible
2361. # manager. ansible_install_root's value is only used if those
2362. # default locations do not have venvs preinstalled.
  2363. ansible_install_root = os.path.join(state_dir, 'ansible-bin')
  2364. self.ansible_manager = self._ansible_manager_class(
  2365. ansible_dir, runtime_install_root=ansible_install_root)
  2366. if not self.ansible_manager.validate():
  2367. if not manage_ansible:
  2368. raise Exception('Error while validating ansible '
  2369. 'installations. Please run '
  2370. 'zuul-manage-ansible to install all supported '
  2371. 'ansible versions.')
  2372. else:
  2373. self.ansible_manager.install()
  2374. self.ansible_manager.copyAnsibleFiles()
  2375. self.process_merge_jobs = get_default(self.config, 'executor',
  2376. 'merge_jobs', True)
  2377. function_name = 'executor:execute'
  2378. if self.zone:
  2379. function_name += ':%s' % self.zone
  2380. self.executor_jobs = {
  2381. "executor:resume:%s" % self.hostname: self.resumeJob,
  2382. "executor:stop:%s" % self.hostname: self.stopJob,
  2383. function_name: self.executeJob,
  2384. }
  2385. self.executor_gearworker = ZuulGearWorker(
  2386. 'Zuul Executor Server',
  2387. 'zuul.ExecutorServer.ExecuteWorker',
  2388. 'executor',
  2389. self.config,
  2390. self.executor_jobs,
  2391. worker_class=ExecutorExecuteWorker,
  2392. worker_args=[self])
  2393. # Used to offload expensive operations to different processes
  2394. self.process_worker = None
  2395. def _repoLock(self, connection_name, project_name):
  2396. return self.repo_locks.getRepoLock(connection_name, project_name)
  2397. def start(self):
  2398. # Start merger worker only if we process merge jobs
  2399. if self.process_merge_jobs:
  2400. super().start()
  2401. self._running = True
  2402. self._command_running = True
  2403. try:
  2404. multiprocessing.set_start_method('spawn')
  2405. except RuntimeError:
2406. # Note: During tests this can be called multiple times, which
2407. # results in a runtime error. This is ok here as we've already
2408. # set this correctly.
  2409. self.log.warning('Multiprocessing context has already been set')
  2410. self.process_worker = ProcessPoolExecutor()
  2411. self.executor_gearworker.start()
  2412. self.log.debug("Starting command processor")
  2413. self.command_socket.start()
  2414. self.command_thread = threading.Thread(target=self.runCommand,
  2415. name='command')
  2416. self.command_thread.daemon = True
  2417. self.command_thread.start()
  2418. self.log.debug("Starting %s update workers" % self.update_workers)
  2419. for i in range(self.update_workers):
  2420. update_thread = threading.Thread(target=self._updateLoop,
  2421. name='update')
  2422. update_thread.daemon = True
  2423. update_thread.start()
  2424. self.update_threads.append(update_thread)
  2425. self.governor_stop_event = threading.Event()
  2426. self.governor_thread = threading.Thread(target=self.run_governor,
  2427. name='governor')
  2428. self.governor_thread.daemon = True
  2429. self.governor_thread.start()
  2430. self.disk_accountant.start()
  2431. def register_work(self):
  2432. if self._running:
  2433. self.accepting_work = True
  2434. function_name = 'executor:execute'
  2435. if self.zone:
  2436. function_name += ':%s' % self.zone
  2437. self.executor_gearworker.gearman.registerFunction(function_name)
  2438. # TODO(jeblair): Update geard to send a noop after
  2439. # registering for a job which is in the queue, then remove
  2440. # this API violation.
  2441. self.executor_gearworker.gearman._sendGrabJobUniq()
  2442. def unregister_work(self):
  2443. self.accepting_work = False
  2444. function_name = 'executor:execute'
  2445. if self.zone:
  2446. function_name += ':%s' % self.zone
  2447. self.executor_gearworker.gearman.unRegisterFunction(function_name)
  2448. def stop(self):
  2449. self.log.debug("Stopping")
  2450. self.connections.stop()
  2451. self.disk_accountant.stop()
  2452. # The governor can change function registration, so make sure
  2453. # it has stopped.
  2454. self.governor_stop_event.set()
  2455. self.governor_thread.join()
  2456. # Stop accepting new jobs
  2457. if self.merger_gearworker is not None:
  2458. self.merger_gearworker.gearman.setFunctions([])
  2459. self.executor_gearworker.gearman.setFunctions([])
  2460. # Tell the executor worker to abort any jobs it just accepted,
  2461. # and grab the list of currently running job workers.
  2462. with self.run_lock:
  2463. self._running = False
  2464. self._command_running = False
  2465. workers = list(self.job_workers.values())
  2466. for job_worker in workers:
  2467. try:
  2468. job_worker.stop()
  2469. except Exception:
  2470. self.log.exception("Exception sending stop command "
  2471. "to worker:")
  2472. for job_worker in workers:
  2473. try:
  2474. job_worker.wait()
  2475. except Exception:
  2476. self.log.exception("Exception waiting for worker "
  2477. "to stop:")
  2478. # Now that we aren't accepting any new jobs, and all of the
  2479. # running jobs have stopped, tell the update processor to
  2480. # stop.
  2481. for _ in self.update_threads:
  2482. self.update_queue.put(None)
  2483. self.command_socket.stop()
  2484. # All job results should have been sent by now, shutdown the
  2485. # gearman workers.
  2486. if self.process_merge_jobs:
  2487. super().stop()
  2488. self.executor_gearworker.stop()
  2489. if self.process_worker is not None:
  2490. self.process_worker.shutdown()
  2491. if self.statsd:
  2492. base_key = 'zuul.executor.{hostname}'
  2493. self.statsd.gauge(base_key + '.load_average', 0)
  2494. self.statsd.gauge(base_key + '.pct_used_ram', 0)
  2495. self.statsd.gauge(base_key + '.running_builds', 0)
  2496. self.stop_repl()
  2497. self.log.debug("Stopped")
  2498. def join(self):
  2499. self.governor_thread.join()
  2500. for update_thread in self.update_threads:
  2501. update_thread.join()
  2502. if self.process_merge_jobs:
  2503. super().join()
  2504. self.executor_gearworker.join()
  2505. self.command_thread.join()
  2506. def pause(self):
  2507. self.log.debug('Pausing')
  2508. self.pause_sensor.pause = True
  2509. if self.process_merge_jobs:
  2510. super().pause()
  2511. def unpause(self):
  2512. self.log.debug('Resuming')
  2513. self.pause_sensor.pause = False
  2514. if self.process_merge_jobs:
  2515. super().unpause()
  2516. def graceful(self):
2517. # This pauses the executor and shuts it down when there are no
2518. # running builds left anymore
  2519. self.log.info('Stopping graceful')
  2520. self.pause()
  2521. while self.job_workers:
  2522. self.log.debug('Waiting for %s jobs to end', len(self.job_workers))
  2523. time.sleep(30)
  2524. try:
  2525. self.stop()
  2526. except Exception:
  2527. self.log.exception('Error while stopping')
  2528. def verboseOn(self):
  2529. self.verbose = True
  2530. def verboseOff(self):
  2531. self.verbose = False
  2532. def keep(self):
  2533. self.keep_jobdir = True
  2534. def nokeep(self):
  2535. self.keep_jobdir = False
  2536. def start_repl(self):
  2537. if self.repl:
  2538. return
  2539. self.repl = zuul.lib.repl.REPLServer(self)
  2540. self.repl.start()
  2541. def stop_repl(self):
  2542. if not self.repl:
  2543. # not running
  2544. return
  2545. self.repl.stop()
  2546. self.repl = None
  2547. def runCommand(self):
  2548. while self._command_running:
  2549. try:
  2550. command = self.command_socket.get().decode('utf8')
  2551. if command != '_stop':
  2552. self.command_map[command]()
  2553. except Exception:
  2554. self.log.exception("Exception while processing command")
  2555. def _updateLoop(self):
  2556. while True:
  2557. try:
  2558. self._innerUpdateLoop()
  2559. except StopException:
  2560. return
  2561. except Exception:
  2562. self.log.exception("Exception in update thread:")
  2563. def resetProcessPool(self):
  2564. """
  2565. This is called in order to re-initialize a broken process pool if it
  2566. got broken e.g. by an oom killed child process
  2567. """
  2568. if self.process_worker:
  2569. try:
  2570. self.process_worker.shutdown()
  2571. except Exception:
  2572. self.log.exception('Failed to shutdown broken process worker')
  2573. self.process_worker = ProcessPoolExecutor()
  2574. def _innerUpdateLoop(self):
  2575. # Inside of a loop that keeps the main repositories up to date
  2576. task = self.update_queue.get()
  2577. if task is None:
  2578. # We are asked to stop
  2579. raise StopException()
  2580. log = get_annotated_logger(
  2581. self.log, task.zuul_event_id, build=task.build)
  2582. try:
  2583. lock = self.repo_locks.getRepoLock(
  2584. task.connection_name, task.project_name)
  2585. with lock:
  2586. log.info("Updating repo %s/%s",
  2587. task.connection_name, task.project_name)
  2588. self.merger.updateRepo(
  2589. task.connection_name, task.project_name,
  2590. repo_state=task.repo_state,
  2591. zuul_event_id=task.zuul_event_id, build=task.build,
  2592. process_worker=self.process_worker)
  2593. repo = self.merger.getRepo(
  2594. task.connection_name, task.project_name)
  2595. source = self.connections.getSource(task.connection_name)
  2596. project = source.getProject(task.project_name)
  2597. task.canonical_name = project.canonical_name
  2598. task.branches = repo.getBranches()
  2599. task.refs = [r.name for r in repo.getRefs()]
  2600. log.debug("Finished updating repo %s/%s",
  2601. task.connection_name, task.project_name)
  2602. task.success = True
  2603. except BrokenProcessPool:
  2604. # The process pool got broken. Reset it to unbreak it for further
  2605. # requests.
  2606. log.exception('Process pool got broken')
  2607. self.resetProcessPool()
  2608. except Exception:
  2609. log.exception('Got exception while updating repo %s/%s',
  2610. task.connection_name, task.project_name)
  2611. finally:
  2612. task.setComplete()
  2613. def update(self, connection_name, project_name, repo_state=None,
  2614. zuul_event_id=None, build=None):
  2615. # Update a repository in the main merger
  2616. state = None
  2617. if repo_state:
  2618. state = repo_state.get(connection_name, {}).get(project_name)
  2619. task = UpdateTask(connection_name, project_name, repo_state=state,
  2620. zuul_event_id=zuul_event_id, build=build)
  2621. task = self.update_queue.put(task)
  2622. return task
  2623. def _update(self, connection_name, project_name, zuul_event_id=None):
  2624. """
  2625. The executor overrides _update so it can do the update asynchronously.
  2626. """
  2627. log = get_annotated_logger(self.log, zuul_event_id)
  2628. task = self.update(connection_name, project_name,
  2629. zuul_event_id=zuul_event_id)
  2630. task.wait()
  2631. if not task.success:
  2632. msg = "Update of '{}' failed".format(project_name)
  2633. log.error(msg)
  2634. raise Exception(msg)
  2635. def executeJob(self, job):
  2636. args = json.loads(job.arguments)
  2637. zuul_event_id = args.get('zuul_event_id')
  2638. log = get_annotated_logger(self.log, zuul_event_id)
  2639. log.debug("Got %s job: %s", job.name, job.unique)
  2640. if self.statsd:
  2641. base_key = 'zuul.executor.{hostname}'
  2642. self.statsd.incr(base_key + '.builds')
  2643. self.job_workers[job.unique] = self._job_class(self, job)
  2644. # Run manageLoad before starting the thread mostly for the
  2645. # benefit of the unit tests to make the calculation of the
  2646. # number of starting jobs more deterministic.
  2647. self.manageLoad()
  2648. self.job_workers[job.unique].run()
  2649. def run_governor(self):
  2650. while not self.governor_stop_event.wait(10):
  2651. try:
  2652. self.manageLoad()
  2653. except Exception:
  2654. self.log.exception("Exception in governor thread:")
  2655. def manageLoad(self):
  2656. ''' Apply some heuristics to decide whether or not we should
  2657. be asking for more jobs '''
  2658. with self.governor_lock:
  2659. return self._manageLoad()
  2660. def _manageLoad(self):
  2661. if self.accepting_work:
  2662. # Don't unregister if we don't have any active jobs.
  2663. for sensor in self.sensors:
  2664. ok, message = sensor.isOk()
  2665. if not ok:
  2666. self.log.info(
  2667. "Unregistering due to {}".format(message))
  2668. self.unregister_work()
  2669. break
  2670. else:
  2671. reregister = True
  2672. limits = []
  2673. for sensor in self.sensors:
  2674. ok, message = sensor.isOk()
  2675. limits.append(message)
  2676. if not ok:
  2677. reregister = False
  2678. break
  2679. if reregister:
2680. self.log.info("Re-registering as load is within limits: "
2681. "{}".format(", ".join(limits)))
  2682. self.register_work()
  2683. if self.statsd:
  2684. base_key = 'zuul.executor.{hostname}'
  2685. for sensor in self.sensors:
  2686. sensor.reportStats(self.statsd, base_key)
  2687. def finishJob(self, unique):
2688. del self.job_workers[unique]
  2689. def stopJobDiskFull(self, jobdir):
  2690. unique = os.path.basename(jobdir)
  2691. self.stopJobByUnique(unique, reason=AnsibleJob.RESULT_DISK_FULL)
  2692. def resumeJob(self, job):
  2693. try:
  2694. args = json.loads(job.arguments)
  2695. zuul_event_id = args.get('zuul_event_id')
  2696. log = get_annotated_logger(self.log, zuul_event_id)
  2697. log.debug("Resume job with arguments: %s", args)
  2698. unique = args['uuid']
  2699. self.resumeJobByUnique(unique, zuul_event_id=zuul_event_id)
  2700. finally:
  2701. job.sendWorkComplete()
  2702. def stopJob(self, job):
  2703. try:
  2704. args = json.loads(job.arguments)
  2705. zuul_event_id = args.get('zuul_event_id')
  2706. log = get_annotated_logger(self.log, zuul_event_id)
  2707. log.debug("Stop job with arguments: %s", args)
  2708. unique = args['uuid']
  2709. self.stopJobByUnique(unique, zuul_event_id=zuul_event_id)
  2710. finally:
  2711. job.sendWorkComplete()
  2712. def resumeJobByUnique(self, unique, zuul_event_id=None):
  2713. log = get_annotated_logger(self.log, zuul_event_id)
  2714. job_worker = self.job_workers.get(unique)
  2715. if not job_worker:
  2716. log.debug("Unable to find worker for job %s", unique)
  2717. return
  2718. try:
  2719. job_worker.resume()
  2720. except Exception:
  2721. log.exception("Exception sending resume command to worker:")
  2722. def stopJobByUnique(self, unique, reason=None, zuul_event_id=None):
  2723. log = get_annotated_logger(self.log, zuul_event_id)
  2724. job_worker = self.job_workers.get(unique)
  2725. if not job_worker:
  2726. log.debug("Unable to find worker for job %s", unique)
  2727. return
  2728. try:
  2729. job_worker.stop(reason)
  2730. except Exception:
  2731. log.exception("Exception sending stop command to worker:")