From 2ed0fd09123f12e21e53a66d2f598c14a0226146 Mon Sep 17 00:00:00 2001
From: Daniel Pawlik
Date: Fri, 25 Feb 2022 09:16:48 +0100
Subject: [PATCH] Add option to download logs to the directory

The new option '--download' gives the possibility to download the log
files together with additional Zuul information files, which can later
be used by another tool to send the content to the next log processing
service.
The option was proposed in another patch set [1], which is large and
makes the review more complex.

[1] https://review.opendev.org/c/openstack/ci-log-processing/+/830337

Change-Id: I88d39c8ca2e186f68296e0f21551eac2c550b5b8
---
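Note for reviewers (not part of the commit message): with '--download',
each build is written to <directory>/<build uuid>/ together with a
'buildinfo' metadata file (see save_build_info() in the diff below). A
minimal sketch of the follow-up consumer this enables, assuming that
directory layout and plain-YAML metadata; consume() and the send
callback are hypothetical names, not part of this patch:

    import os
    import yaml

    def consume(download_dir, send):
        """Hand each downloaded build (metadata + log paths) to a callback."""
        for uuid in os.listdir(download_dir):
            build_dir = os.path.join(download_dir, uuid)
            with open(os.path.join(build_dir, "buildinfo")) as f:
                build = yaml.safe_load(f)  # written by save_build_info()
            logs = [os.path.join(build_dir, name)
                    for name in os.listdir(build_dir)
                    if name != "buildinfo"]
            send(build, logs)  # e.g. ship to the next log processing service

    # Example: print what would be sent.
    # consume("/mnt/logscraper", lambda b, logs: print(b["uuid"], logs))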
 ansible/roles/check-services/tasks/main.yml   |  19 ++-
 ansible/roles/logscraper/README.rst           |   4 +-
 ansible/roles/logscraper/defaults/main.yml    |   8 +-
 ansible/roles/logscraper/tasks/service.yml    |   9 ++
 .../logscraper/templates/logscraper.sh.j2     |  10 ++
 doc/source/logscraper.rst                     |  45 +++++++
 logscraper/logscraper.py                      | 116 ++++++++++++++----
 logscraper/tests/test_logscraper.py           |  76 ++++++++++--
 8 files changed, 244 insertions(+), 43 deletions(-)

diff --git a/ansible/roles/check-services/tasks/main.yml b/ansible/roles/check-services/tasks/main.yml
index c5a0e82..e0ef7ef 100644
--- a/ansible/roles/check-services/tasks/main.yml
+++ b/ansible/roles/check-services/tasks/main.yml
@@ -7,11 +7,19 @@
 ### BUILD CONTAINER IMAGES ###
 - name: Build container images
   block:
-    - name: Build logscraper container image
+    - name: Build logscraper container image - Zuul
       shell: >
         podman build -t quay.io/logscraper:dev -f Dockerfile
       args:
         chdir: "{{ zuul.projects['opendev.org/openstack/ci-log-processing'].src_dir }}"
+      when: zuul is defined
+
+    - name: Build logscraper container image - non Zuul
+      shell: >
+        podman build -t quay.io/logscraper:dev -f Dockerfile
+      args:
+        chdir: "{{ playbook_dir }}"
+      when: zuul is not defined

     - name: Get logscraper image id
       shell: |
@@ -23,6 +31,14 @@
         podman build -t quay.io/loggearman:dev -f loggearman/Dockerfile
       args:
         chdir: "{{ zuul.projects['opendev.org/openstack/ci-log-processing'].src_dir }}"
+      when: zuul is defined
+
+    - name: Build loggearman container image - non Zuul
+      shell: >
+        podman build -t quay.io/loggearman:dev -f loggearman/Dockerfile
+      args:
+        chdir: "{{ playbook_dir }}"
+      when: zuul is not defined

     - name: Get loggearman image id
       shell: |
@@ -38,7 +54,6 @@
     container_images:
       logscraper: "{{ _logscraper_image_id.stdout }}"
       loggearman: "{{ _loggearman_image_id.stdout }}"
-  when: zuul is defined

 ### OPENSEARCH ####
 - name: Setup Opensearch
diff --git a/ansible/roles/logscraper/README.rst b/ansible/roles/logscraper/README.rst
index 821facc..d5f353b 100644
--- a/ansible/roles/logscraper/README.rst
+++ b/ansible/roles/logscraper/README.rst
@@ -60,10 +60,10 @@ and second one for getting logs from `sometenant` tenant.
           zuul_api_url: https://zuul.opendev.org/api/tenant/openstack
           insecure: False
         - tenant: sometenant
-          gearman_port: 4731
-          gearman_server: someproject.org
           zuul_api_url: https://zuul.opendev.org/api/tenant/sometenant
           insecure: True
+          download: true
+          download_dir: /mnt/logscraper
   roles:
     - logscraper
diff --git a/ansible/roles/logscraper/defaults/main.yml b/ansible/roles/logscraper/defaults/main.yml
index 7a34351..1610ad4 100644
--- a/ansible/roles/logscraper/defaults/main.yml
+++ b/ansible/roles/logscraper/defaults/main.yml
@@ -15,10 +15,10 @@ container_images:
 #   gearman_port: 4731
 #   gearman_server: logstash.openstack.org
 #   zuul_api_url: https://zuul.opendev.org/api/tenant/openstack
-#   insecure: False
+#   insecure: false
 # - tenant: sometenant
-#   gearman_port: 4731
-#   gearman_server: logstash.openstack.org
 #   zuul_api_url: https://zuul.opendev.org/api/tenant/sometenant
-#   insecure: True
+#   insecure: true
+#   download: true
+#   download_dir: /mnt/logscraper/sometenant
 tenant_builds: []
diff --git a/ansible/roles/logscraper/tasks/service.yml b/ansible/roles/logscraper/tasks/service.yml
index 03a2528..43ccb8c 100644
--- a/ansible/roles/logscraper/tasks/service.yml
+++ b/ansible/roles/logscraper/tasks/service.yml
@@ -1,4 +1,13 @@
 ---
+- name: Create logscraper download directory
+  file:
+    path: "{{ item.download_dir }}"
+    state: directory
+    recurse: true
+    owner: "{{ logscraper_user }}"
+    group: "{{ logscraper_group }}"
+  when: "'download_dir' in item"
+
 - name: Generate logscraper script
   template:
     src: logscraper.sh.j2
diff --git a/ansible/roles/logscraper/templates/logscraper.sh.j2 b/ansible/roles/logscraper/templates/logscraper.sh.j2
index f416504..187eea1 100644
--- a/ansible/roles/logscraper/templates/logscraper.sh.j2
+++ b/ansible/roles/logscraper/templates/logscraper.sh.j2
@@ -8,9 +8,19 @@
     --uidmap 1000:{{ logscraper_uid }}:1 \
     --name logscraper-{{ item.tenant }} \
     --volume {{ logscraper_dir }}:{{ logscraper_dir }}:z \
+    {% if 'download_dir' in item %}
+    --volume {{ item.download_dir }}:{{ item.download_dir }}:z \
+    {% endif %}
     {{ container_images['logscraper'] }} \
+    {% if 'gearman_port' in item and 'gearman_server' in item %}
     --gearman-port {{ item.gearman_port }} \
     --gearman-server {{ item.gearman_server }} \
+    {% else %}
+    --download \
+    {% endif %}
+    {% if 'download_dir' in item %}
+    --directory {{ item.download_dir }} \
+    {% endif %}
     --checkpoint-file {{ item.checkpoint_file | default(logscraper_dir + '/checkpoint') }} \
     --follow \
     --zuul-api-url {{ item.zuul_api_url }}
diff --git a/doc/source/logscraper.rst b/doc/source/logscraper.rst
index dc98f86..c96e6ab 100644
--- a/doc/source/logscraper.rst
+++ b/doc/source/logscraper.rst
@@ -11,6 +11,38 @@ It is available by typing:

 .. code-block::

    logscraper --help

+   Fetch and push last Zuul CI job logs into gearman.
+
+   optional arguments:
+     -h, --help            show this help message and exit
+     --zuul-api-url ZUUL_API_URL
+                           URL(s) for Zuul API. Parameter can be set multiple
+                           times.
+     --job-name JOB_NAME   CI job name(s). Parameter can be set multiple times.
+                           If not set, all of the latest builds are scraped.
+     --gearman-server GEARMAN_SERVER
+                           Gearman host address
+     --gearman-port GEARMAN_PORT
+                           Gearman listen port. Defaults to 4730.
+     --follow              Keep polling Zuul builds
+     --insecure            Skip validating SSL cert
+     --checkpoint-file CHECKPOINT_FILE
+                           File that will keep information about the last uuid
+                           and timestamp for a job.
+     --logstash-url LOGSTASH_URL
+                           When provided, the script will check the connection
+                           to the Logstash service before sending to the log
+                           processing system. For example: logstash.local:9999
+     --workers WORKERS     Worker processes for logscraper
+     --max-skipped MAX_SKIPPED
+                           How many job results should be checked until the
+                           last uuid written in the checkpoint file is found
+     --debug               Print more information
+     --download            Download logs and do not send them to the gearman
+                           service
+     --directory DIRECTORY
+                           Directory where the logs will be stored. Defaults
+                           to: /tmp/logscraper
+
 Basic usage
 -----------

@@ -43,6 +75,12 @@ Example:

    logscraper --gearman-server localhost --job-name tripleo-ci-centos-8-containers-multinode --job-name openstack-tox-linters --zuul-api-url https://zuul.opendev.org/api/tenant/openstack --zuul-api-url https://zuul.opendev.org/api/tenant/local

+* download logs to /mnt/logscraper. NOTE: if you are using the container service, this directory needs to be mounted!
+
+.. code-block::
+
+   logscraper --zuul-api-url https://zuul.opendev.org/api/tenant/openstack --directory /mnt/logscraper --download
+
 Containerize tool
 -----------------

@@ -62,3 +100,10 @@ to the container, for example:

 .. code-block::

    docker run -v $(pwd):/checkpoint-dir:z -d logscraper logscraper --gearman-server somehost --zuul-api-url https://zuul.opendev.org/api/tenant/openstack --checkpoint-file /checkpoint-dir/checkpoint.txt --follow
+
+
+In this example, logscraper will download log files to the /mnt/logscraper directory:
+
+.. code-block::
+
+   docker run -v $(pwd):/checkpoint-dir:z -v /mnt/logscraper:/mnt/logscraper:z -d logscraper logscraper --zuul-api-url https://zuul.opendev.org/api/tenant/openstack --checkpoint-file /checkpoint-dir/checkpoint.txt --directory /mnt/logscraper --download --follow
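Note for reviewers: the download helper added in logscraper.py below can
be smoke-tested directly from Python. A minimal sketch, assuming the
download_file(url, directory, insecure=False) signature introduced by
this patch; the URL is illustrative only:

    import tempfile

    from logscraper.logscraper import download_file

    # Hypothetical log URL; point it at a real job-output.txt on your
    # log server.
    url = "https://logserver.example.org/123/job-output.txt"
    with tempfile.TemporaryDirectory() as directory:
        # Streams the body to <directory>/job-output.txt and returns the
        # file name on HTTP 200; returns None otherwise.
        print(download_file(url, directory))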
For example: " "logstash.local:9999") parser.add_argument("--workers", help="Worker processes for logscraper", + type=int, default=1) parser.add_argument("--max-skipped", help="How many job results should be " "checked until last uuid written in checkpoint file " @@ -188,6 +193,12 @@ def get_arguments(): default=500) parser.add_argument("--debug", help="Print more information", action="store_true") + parser.add_argument("--download", help="Download logs and do not send " + "to gearman service", + action="store_true") + parser.add_argument("--directory", help="Directory, where the logs will " + "be stored. Defaults to: /tmp/logscraper", + default="/tmp/logscraper") args = parser.parse_args() return args @@ -292,7 +303,7 @@ class LogMatcher(object): fields["build_newrev"] = result.get("newrev", "UNKNOWN") fields["node_provider"] = "local" - log_url = urllib.parse.urljoin(result["log_url"], filename) + log_url = urljoin(result["log_url"], filename) fields["log_url"] = log_url fields["tenant"] = result["tenant"] @@ -401,17 +412,45 @@ def get_last_job_results(zuul_url, insecure, max_skipped, last_uuid, ############################################################################### # Log scraper # ############################################################################### -def check_specified_files(job_result, insecure): - """Return list of specified files if they exists on logserver. """ - available_files = [] - for f in file_to_check: - if not job_result["log_url"]: - continue - response = requests_get("%s%s" % (job_result["log_url"], f), - insecure) +def save_build_info(directory, build): + with open("%s/buildinfo" % directory, "w") as text_file: + yaml.dump(build, text_file) + + +def download_file(url, directory, insecure=False): + logging.debug("Started fetching %s" % url) + filename = url.split("/")[-1] + try: + response = requests.get(url, verify=insecure, stream=True) if response.status_code == 200: - available_files.append(f) - return available_files + if directory: + with open("%s/%s" % (directory, filename), 'wb') as f: + for txt in response.iter_content(1024): + f.write(txt) + return filename + except requests.exceptions.ContentDecodingError: + logging.critical("Can not decode content from %s" % url) + + +def check_specified_files(job_result, insecure, directory=None): + """Return list of specified files if they exists on logserver. """ + + args = job_result.get("build_args") + if not job_result["log_url"]: + logging.debug("There is no file to download for build " + "uuid: %s" % job_result["uuid"]) + return + + build_log_urls = [urljoin(job_result["log_url"], s) for s in file_to_check] + + results = [] + pool = ThreadPoolExecutor(max_workers=args.workers) + for page in pool.map(download_file, build_log_urls, + itertools.repeat(directory), + itertools.repeat(insecure)): + if page: + results.append(page) + return results def setup_logging(debug): @@ -425,7 +464,12 @@ def setup_logging(debug): def run_build(build): """Submit job informations into log processing system. """ - args = build.pop("build_args") + args = build.get("build_args") + + # NOTE: if build result is "ABORTED", there is no any + # job result files to parse. Skipping that file. 
+ if build["result"].lower() == 'aborted': + return logging.info( "Processing logs for %s | %s | %s | %s", @@ -435,18 +479,36 @@ def run_build(build): build["uuid"], ) - results = dict(files=[], jobs=[], invocation={}) + if args.download: + logging.debug("Started fetching build logs") + directory = "%s/%s" % (args.directory, build["uuid"]) + try: + if not os.path.exists(directory): + os.makedirs(directory) + except PermissionError: + logging.critical("Can not create directory %s" % directory) + except Exception as e: + logging.critical("Exception occured %s on creating dir %s" % ( + e, directory)) - lmc = LogMatcher( - args.gearman_server, - args.gearman_port, - build["result"], - build["log_url"], - {}, - ) - results["files"] = check_specified_files(build, args.insecure) + check_specified_files(build, args.insecure, directory) + save_build_info(directory, build) + else: + logging.debug("Parsing content for gearman service") + results = dict(files=[], jobs=[], invocation={}) + files = check_specified_files(build, args.insecure) + if not files: + return + results["files"] = files + lmc = LogMatcher( + args.gearman_server, + args.gearman_port, + build["result"], + build["log_url"], + {}, + ) - lmc.submitJobs("push-log", results["files"], build) + lmc.submitJobs("push-log", results["files"], build) def check_connection(logstash_url): @@ -512,6 +574,10 @@ def run(args): def main(): args = get_arguments() setup_logging(args.debug) + if args.download and args.gearman_server and args.gearman_port: + logging.critical("Can not use logscraper to send logs to gearman " + "and dowload logs. Choose one") + sys.exit(1) while True: run(args) if not args.follow: diff --git a/logscraper/tests/test_logscraper.py b/logscraper/tests/test_logscraper.py index 104ef6a..5a71b53 100644 --- a/logscraper/tests/test_logscraper.py +++ b/logscraper/tests/test_logscraper.py @@ -103,12 +103,24 @@ builds_result = [{ }] +class _MockedPoolMapResult: + def __init__(self, func, iterable): + self.func = func + self.iterable = iterable + + # mocked results + self._value = [self.func(i) for i in iterable] + + def get(self, timeout=0): + return self._value + + class FakeArgs(object): def __init__(self, zuul_api_url=None, gearman_server=None, gearman_port=None, follow=False, insecure=False, checkpoint_file=None, ignore_checkpoint=None, logstash_url=None, workers=None, max_skipped=None, - job_name=None): + job_name=None, download=None, directory=None): self.zuul_api_url = zuul_api_url self.gearman_server = gearman_server @@ -121,6 +133,8 @@ class FakeArgs(object): self.workers = workers self.max_skipped = max_skipped self.job_name = job_name + self.download = download + self.directory = directory class TestScraper(base.TestCase): @@ -183,7 +197,7 @@ class TestScraper(base.TestCase): logstash_url='localhost:9999') args = logscraper.get_arguments() logscraper.check_connection(args.logstash_url) - mock_socket.assert_called_once() + self.assertTrue(mock_socket.called) @mock.patch('socket.socket') def test_check_connection_wrong_host(self, mock_socket): @@ -216,7 +230,8 @@ class TestScraper(base.TestCase): 'someuuid', None) self.assertRaises(ValueError, make_fake_list, job_result) - @mock.patch('multiprocessing.pool.Pool.map') + @mock.patch('logscraper.logscraper.save_build_info') + @mock.patch('logscraper.logscraper.check_specified_files') @mock.patch('builtins.open', new_callable=mock.mock_open()) @mock.patch('os.path.isfile') @mock.patch('logscraper.logscraper.check_specified_files', @@ -229,15 +244,24 @@ class 
diff --git a/logscraper/tests/test_logscraper.py b/logscraper/tests/test_logscraper.py
index 104ef6a..5a71b53 100644
--- a/logscraper/tests/test_logscraper.py
+++ b/logscraper/tests/test_logscraper.py
@@ -103,12 +103,24 @@ builds_result = [{
 }]


+class _MockedPoolMapResult:
+    def __init__(self, func, iterable):
+        self.func = func
+        self.iterable = iterable
+
+        # mocked results
+        self._value = [self.func(i) for i in iterable]
+
+    def get(self, timeout=0):
+        return self._value
+
+
 class FakeArgs(object):
     def __init__(self, zuul_api_url=None, gearman_server=None,
                  gearman_port=None, follow=False, insecure=False,
                  checkpoint_file=None, ignore_checkpoint=None,
                  logstash_url=None, workers=None, max_skipped=None,
-                 job_name=None):
+                 job_name=None, download=None, directory=None):

         self.zuul_api_url = zuul_api_url
         self.gearman_server = gearman_server
@@ -121,6 +133,8 @@ class FakeArgs(object):
         self.workers = workers
         self.max_skipped = max_skipped
         self.job_name = job_name
+        self.download = download
+        self.directory = directory


 class TestScraper(base.TestCase):
@@ -183,7 +197,7 @@ class TestScraper(base.TestCase):
             logstash_url='localhost:9999')
         args = logscraper.get_arguments()
         logscraper.check_connection(args.logstash_url)
-        mock_socket.assert_called_once()
+        self.assertTrue(mock_socket.called)

     @mock.patch('socket.socket')
     def test_check_connection_wrong_host(self, mock_socket):
@@ -216,7 +230,8 @@ class TestScraper(base.TestCase):
                                              'someuuid', None)
             self.assertRaises(ValueError, make_fake_list, job_result)

-    @mock.patch('multiprocessing.pool.Pool.map')
+    @mock.patch('logscraper.logscraper.save_build_info')
+    @mock.patch('logscraper.logscraper.check_specified_files')
     @mock.patch('builtins.open', new_callable=mock.mock_open())
     @mock.patch('os.path.isfile')
     @mock.patch('logscraper.logscraper.check_specified_files',
                 return_value=['job-output.txt'])
     @mock.patch('logscraper.logscraper.LogMatcher.submitJobs')
     @mock.patch('argparse.ArgumentParser.parse_args',
                 return_value=FakeArgs(
                     zuul_api_url=['http://somehost.com/api/tenant/tenant1'],
                     gearman_server='localhost',
@@ -229,15 +244,24 @@ class TestScraper(base.TestCase):
                     gearman_port=4731,
                     workers=1))
     def test_run_scraping(self, mock_args, mock_submit, mock_files,
-                          mock_isfile, mock_readfile, mock_map):
+                          mock_isfile, mock_readfile, mock_specified_files,
+                          mock_save_buildinfo):
         with mock.patch('logscraper.logscraper.get_last_job_results'
                         ) as mock_job_results:
-            args = logscraper.get_arguments()
-            mock_job_results.return_value = [builds_result[0]]
-            logscraper.run_scraping(args,
-                                    'http://somehost.com/api/tenant/tenant1')
-            self.assertEqual(builds_result[0], mock_map.call_args.args[1][0])
-            self.assertIn("build_args", mock_map.call_args.args[1][0])
+            with mock.patch('multiprocessing.pool.Pool.map',
+                            lambda self, func, iterable, chunksize=None,
+                            callback=None, error_callback=None:
+                            _MockedPoolMapResult(func, iterable)):
+                args = logscraper.get_arguments()
+                mock_job_results.return_value = [builds_result[0]]
+                logscraper.run_scraping(
+                    args, 'http://somehost.com/api/tenant/tenant1')
+                self.assertEqual(builds_result[0]['uuid'],
+                                 mock_submit.call_args.args[2]['uuid'])
+                self.assertTrue(mock_submit.called)
+                self.assertEqual(builds_result[0],
+                                 mock_specified_files.call_args.args[0])
+                self.assertFalse(mock_save_buildinfo.called)

     @mock.patch('logscraper.logscraper.run_scraping')
     def test_run(self, mock_scraping):
@@ -251,6 +275,38 @@ class TestScraper(base.TestCase):
             logscraper.run(args)
             self.assertEqual(3, mock_scraping.call_count)

+    @mock.patch('logscraper.logscraper.save_build_info')
+    @mock.patch('logscraper.logscraper.check_specified_files')
+    @mock.patch('builtins.open', new_callable=mock.mock_open())
+    @mock.patch('os.path.isfile')
+    @mock.patch('logscraper.logscraper.check_specified_files',
+                return_value=['job-output.txt'])
+    @mock.patch('logscraper.logscraper.LogMatcher.submitJobs')
+    @mock.patch('argparse.ArgumentParser.parse_args',
+                return_value=FakeArgs(
+                    zuul_api_url=['http://somehost.com/api/tenant/tenant1'],
+                    workers=1, download=True, directory="/tmp/testdir"))
+    def test_run_scraping_download(self, mock_args, mock_submit, mock_files,
+                                   mock_isfile, mock_readfile,
+                                   mock_specified_files, mock_save_buildinfo):
+        with mock.patch('logscraper.logscraper.get_last_job_results'
+                        ) as mock_job_results:
+            with mock.patch(
+                'multiprocessing.pool.Pool.map',
+                lambda self, func, iterable, chunksize=None, callback=None,
+                error_callback=None: _MockedPoolMapResult(func, iterable),
+            ):
+                args = logscraper.get_arguments()
+                mock_job_results.return_value = [builds_result[0]]
+                logscraper.run_scraping(
+                    args, 'http://somehost.com/api/tenant/tenant1')
+
+                self.assertFalse(mock_submit.called)
+                self.assertTrue(mock_specified_files.called)
+                self.assertEqual(builds_result[0],
+                                 mock_specified_files.call_args.args[0])
+                self.assertTrue(mock_save_buildinfo.called)
+

 class TestConfig(base.TestCase):
     @mock.patch('sys.exit')