From 7f9e1017b08c8011bcab874b1e1b73d30be67895 Mon Sep 17 00:00:00 2001
From: Gabriele Cerami
Date: Tue, 13 Nov 2018 11:53:48 +0000
Subject: [PATCH] Collect logs: Handle errors and timeout

Currently, if the collect logs script fails or times out, none of
the copy tasks are run, and we may end up with no logs at all, not even
those of the log collection itself.
This patch encloses the log collection within a block to handle errors,
and runs the script with a specified timeout, to always leave some time
for copying what logs we have from the node to the executor.

The script receives SIGTERM when the timeout expires and SIGKILL 60
seconds later if it is still running. The -k duration of timeout(1) is
counted from the initial signal, not from the start of the command, so
it is passed as the grace period alone.

Change-Id: I428bd0aa8e35a2a94f2cf3039dd9e3ae683334a6
---
 playbooks/tripleo-ci/post.yaml | 71 +++++++++++++++++++++++-----------
 1 file changed, 48 insertions(+), 23 deletions(-)

diff --git a/playbooks/tripleo-ci/post.yaml b/playbooks/tripleo-ci/post.yaml
index e0a3ec48e..14b9752a4 100644
--- a/playbooks/tripleo-ci/post.yaml
+++ b/playbooks/tripleo-ci/post.yaml
@@ -8,6 +8,10 @@
 - name: Collect logs
   hosts: primary
   tasks:
+    - name: set collection timeout
+      set_fact:
+        collect_timeout_sec: "{{ zuul.post_timeout|default(3600) - copy_logs_time|default(300) }}"
+
     - name: Copy zuul_console_json log to workspace for reproducer
       copy:
         content: "{{ hostvars['localhost'].zuul_console_json }}"
@@ -36,29 +40,50 @@
         fi
       when: environment_type != "ovb" or not undercloud_logs.stat.exists
 
+    - name: Check script existence
+      stat:
+        path: "{{ ansible_user_dir }}/workspace/logs/collect_logs.sh"
+      register: collect_logs_path
 
-- hosts:
-    - primary
-    - centos-7
-  tasks:
+    - name: Collect logs with a timeout
+      block:
+        - name: Run ansible playbook to collect logs
+          shell: |
+            timeout --preserve-status -s 15 \
+              -k 60 {{ collect_timeout_sec|string }} \
+              bash {{ ansible_user_dir }}/workspace/logs/collect_logs.sh
+          when: collect_logs_path.stat.exists
+          register: collect_logs_run
 
-    - name: Ensure artifacts directory exists
-      file:
-        path: '{{ zuul.executor.work_root }}/artifacts'
-        state: directory
-      delegate_to: localhost
+      rescue:
+        - name: warn when collect logs timed out (SIGTERM or SIGKILL used)
+          debug:
+            msg: "ERROR: Collect logs timed out"
+          when: collect_logs_path.stat.exists and (collect_logs_run.rc == 143 or collect_logs_run.rc == 137)
 
-    - name: Copy files from {{ ansible_user_dir }}/workspace/ on node
-      no_log: true
-      failed_when: false
-      synchronize:
-        src: '{{ ansible_user_dir }}/workspace/'
-        dest: '{{ zuul.executor.log_root }}'
-        mode: pull
-        copy_links: true
-        verify_host: true
-        rsync_opts:
-          - --include=/logs/**
-          - --include=*/
-          - --exclude=*
-          - --prune-empty-dirs
+        - name: warn when collect logs failed
+          debug:
+            msg: "ERROR: Collect logs failed, please check the logs"
+          when: collect_logs_path.stat.exists and collect_logs_run.rc != 143 and collect_logs_run.rc != 137
+
+      always:
+        - name: Ensure artifacts directory exists
+          file:
+            path: '{{ zuul.executor.work_root }}/artifacts'
+            state: directory
+          delegate_to: localhost
+
+        - name: Copy files from {{ ansible_user_dir }}/workspace/ on node
+          no_log: true
+          failed_when: false
+          synchronize:
+            src: '{{ ansible_user_dir }}/workspace/'
+            dest: '{{ zuul.executor.log_root }}'
+            mode: pull
+            copy_links: true
+            verify_host: true
+            rsync_opts:
+              - --include=/logs/**
+              - --include=*/
+              - --exclude=*
+              - --prune-empty-dirs