Merge "Add BuildCache checkpoint to avoid missing long running build"

This commit is contained in:
Zuul
2022-03-17 15:20:11 +00:00
committed by Gerrit Code Review
5 changed files with 208 additions and 45 deletions

View File

@@ -98,7 +98,7 @@ To download logs:
logscraper \
--zuul-api-url https://zuul.opendev.org/api/tenant/openstack \
--checkpoint-file /tmp/results-checkpoint.txt \
--checkpoint-file /tmp/results-checkpoint \
--worker 8 \
--max-skipped 100 \
--download \

View File

@@ -55,19 +55,19 @@ Example:
.. code-block::
logscraper --gearman-server somehost --zuul-api-url https://zuul.opendev.org/api/tenant/openstack --checkpoint-file /tmp/results-checkpoint.txt --follow
logscraper --gearman-server somehost --zuul-api-url https://zuul.opendev.org/api/tenant/openstack --checkpoint-file /tmp/results-checkpoint --follow
* one shot on getting logs from `zuul` tenant:
.. code-block::
logscraper --gearman-server localhost --zuul-api-url https://zuul.opendev.org/api/tenant/zuul --checkpoint-file /tmp/zuul-result-timestamp.txt
logscraper --gearman-server localhost --zuul-api-url https://zuul.opendev.org/api/tenant/zuul --checkpoint-file /tmp/zuul-result-timestamp
* periodically scrape logs from tenants: `openstack`, `zuul` and `local`
.. code-block::
logscraper --gearman-server localhost --zuul-api-url https://zuul.opendev.org/api/tenant/openstack --zuul-api-url https://zuul.opendev.org/api/tenant/zuul --zuul-api-url https://zuul.opendev.org/api/tenant/local --checkpoint-file /tmp/someresults.txt --follow
logscraper --gearman-server localhost --zuul-api-url https://zuul.opendev.org/api/tenant/openstack --zuul-api-url https://zuul.opendev.org/api/tenant/zuul --zuul-api-url https://zuul.opendev.org/api/tenant/local --checkpoint-file /tmp/someresults --follow
* scrape logs from two defined job names: `tripleo-ci-centos-8-containers-multinode` and `openstack-tox-linters` for tenants: `openstack` and `local`:
@@ -99,11 +99,11 @@ to the container, for example:
.. code-block::
docker run -v $(pwd):/checkpoint-dir:z -d logscraper logscraper --gearman-server somehost --zuul-api-url https://zuul.opendev.org/api/tenant/openstack --checkpoint-file /checkpoint-dir/checkpoint.txt --follow
docker run -v $(pwd):/checkpoint-dir:z -d logscraper logscraper --gearman-server somehost --zuul-api-url https://zuul.opendev.org/api/tenant/openstack --checkpoint-file /checkpoint-dir/checkpoint --follow
In this example, logscraper will download log files to the /mnt/logscraper directory:
.. code-block::
docker run -v $(pwd):/checkpoint-dir:z -v /mnt/logscraper:/mnt/logscraper:z -d logscraper logscraper --zuul-api-url https://zuul.opendev.org/api/tenant/openstack --checkpoint-file /checkpoint-dir/checkpoint.txt --directory /mnt/logscraper --download --follow
docker run -v $(pwd):/checkpoint-dir:z -v /mnt/logscraper:/mnt/logscraper:z -d logscraper logscraper --zuul-api-url https://zuul.opendev.org/api/tenant/openstack --checkpoint-file /checkpoint-dir/checkpoint --directory /mnt/logscraper --download --follow

View File

@@ -18,10 +18,48 @@
The goal is to push recent zuul builds into log gearman processor.
[ CLI ] -> [ Config ] -> [ ZuulFetcher ] -> [ LogPublisher ]
# Zuul builds results are not sorted by end time. Here is a problematic
scenario:
Neutron01 build starts at 00:00
Many smolXX builds start and stop at 01:00
Neutron01 build stops at 02:00
When at 01:55 we query the /builds:
- smol42 ends at 01:54
- smol41 ends at 01:50
- smol40 ends at 01:49
- ...
When at 02:05 we query the /builds:
- smol42 ends at 01:54 # already in build cache, skip
- smol41 ends at 01:50 # already in build cache, skip
- smol40 ends at 01:49 # already in build cache, skip
- ...
- neutron01 ends at 02:00 # not in build cache, get_last_job_results yield
Question: when to stop the builds query?
We could check that all the _id values got processed, but that can be tricky
when long running builds are interleaved with short ones. For example, the
scraper could keep track of the oldest _id and ensure it got them all.
But instead, we'll always grab the last 1000 builds and process the new ones.
This is not ideal, because we might miss builds if more than 1000 builds
happen between two queries.
But that will have to do until the zuul builds api can return builds sorted by
end_time.
"""
import argparse
import datetime
import gear
import itertools
import json
@@ -30,6 +68,7 @@ import multiprocessing
import os
import requests
import socket
import sqlite3
import sys
import time
import yaml
@@ -190,7 +229,7 @@ def get_arguments():
parser.add_argument("--max-skipped", help="How many job results should be "
"checked until last uuid written in checkpoint file "
"is founded",
default=500)
default=1000)
parser.add_argument("--debug", help="Print more information",
action="store_true")
parser.add_argument("--download", help="Download logs and do not send "
@@ -208,7 +247,6 @@ def get_arguments():
###############################################################################
class Config:
def __init__(self, args, zuul_api_url, job_name=None):
self.checkpoint = None
url_path = zuul_api_url.split("/")
if url_path[-3] != "api" and url_path[-2] != "tenant":
print(
@@ -223,18 +261,74 @@ class Config:
if job_name:
self.filename = "%s-%s" % (self.filename, job_name)
try:
with open(self.filename) as f:
self.checkpoint = f.readline()
except Exception:
logging.exception("Can't load the checkpoint. Creating file")
self.build_cache = BuildCache(self.filename)
def save(self, job_uuid):
def save(self):
try:
with open(self.filename, 'w') as f:
f.write(job_uuid)
self.build_cache.save()
except Exception as e:
raise("Can not write status to the checkpoint file %s" % e)
logging.critical("Can not write status to the build_cache "
"file %s" % e)
class BuildCache:
    """Persistent cache of already-processed zuul build ids.

    The cache is backed by a SQLite database at *filepath* and mirrored in
    memory as ``self.builds`` (uid -> unix timestamp).  Entries older than
    one day are purged on load so the cache does not grow without bound.
    """

    def __init__(self, filepath=None):
        self.builds = dict()
        if not filepath:
            logging.critical("No cache file provided. Can not continue")
            sys.exit(1)
        self.create_db(filepath)
        self.create_table()
        # clean builds that are older than 1 day
        self.clean()
        rows = self.fetch_data()
        if rows:
            for uid, date in rows:
                self.builds[uid] = date

    def create_db(self, filepath):
        """Open (or create) the SQLite database and keep a cursor handy."""
        try:
            self.connection = sqlite3.connect(filepath)
            self.cursor = self.connection.cursor()
        except Exception as e:
            logging.critical("Can not create cache DB! Error %s" % e)

    def create_table(self):
        """Create the logscraper table if it does not exist yet."""
        try:
            self.cursor.execute("CREATE TABLE IF NOT EXISTS logscraper ("
                                "uid INTEGER, timestamp INTEGER)")
        except sqlite3.OperationalError:
            logging.debug("The logscraper table already exists")

    def fetch_data(self):
        """Return all (uid, timestamp) rows, or None on failure."""
        try:
            return self.cursor.execute(
                "SELECT uid, timestamp FROM logscraper").fetchall()
        except Exception as e:
            logging.exception("Can't get data from cache file! Error %s" % e)

    def add(self, uid):
        """Record *uid* as processed right now (in memory until save())."""
        self.builds[uid] = int(datetime.datetime.now().timestamp())

    def clean(self):
        # Remove builds older than one day.  Use a parameterized query
        # instead of interpolating the value into the SQL string.
        yesterday = datetime.datetime.now() - datetime.timedelta(days=1)
        self.cursor.execute("DELETE FROM logscraper WHERE timestamp < ?",
                            (yesterday.timestamp(),))
        self.connection.commit()

    def save(self):
        # Rewrite the table from the in-memory dict.  Clearing first fixes
        # the duplication bug where every save() re-inserted rows that were
        # already persisted by a previous run.
        self.cursor.execute("DELETE FROM logscraper")
        self.cursor.executemany('INSERT INTO logscraper VALUES (?,?)',
                                list(self.builds.items()))
        self.connection.commit()

    def contains(self, uid):
        """Return True when *uid* has already been processed."""
        return uid in self.builds
###############################################################################
@@ -396,17 +490,18 @@ def filter_available_jobs(zuul_api_url, job_names, insecure):
return filtered_jobs
def get_last_job_results(zuul_url, insecure, max_skipped, last_uuid,
def get_last_job_results(zuul_url, insecure, max_builds, build_cache,
job_name):
"""Yield builds until we find the last uuid."""
count = 0
for build in get_builds(zuul_url, insecure, job_name):
if count > int(max_skipped):
break
if build["uuid"] == last_uuid:
break
yield build
count += 1
if count > int(max_builds):
break
if build_cache.contains(build["_id"]):
continue
build_cache.add(build["_id"])
yield build
###############################################################################
@@ -551,14 +646,14 @@ def check_connection(logstash_url):
def run_scraping(args, zuul_api_url, job_name=None):
"""Get latest job results and push them into log processing service.
On the end, write newest uuid into checkpoint file, so in the future
script will not push log duplication.
On the end, write build_cache file, so in the future
script will not push duplicate build.
"""
config = Config(args, zuul_api_url, job_name)
builds = []
for build in get_last_job_results(zuul_api_url, args.insecure,
args.max_skipped, config.checkpoint,
args.max_skipped, config.build_cache,
job_name):
logging.debug("Working on build %s" % build['uuid'])
# add missing informations
@@ -578,7 +673,7 @@ def run_scraping(args, zuul_api_url, job_name=None):
try:
pool.map(run_build, builds)
finally:
config.save(builds[0]['uuid'])
config.save()
def run(args):

View File

@@ -14,7 +14,9 @@
# License for the specific language governing permissions and limitations
# under the License.
import datetime
import json
import tempfile
from logscraper import logscraper
from logscraper.tests import base
@@ -236,12 +238,19 @@ class TestScraper(base.TestCase):
args.logstash_url)
@mock.patch('logscraper.logscraper.get_builds',
return_value=iter([{'uuid': '1234'}]))
def test_get_last_job_results(self, mock_get_builds):
return_value=iter([{'_id': '1234'}]))
@mock.patch('argparse.ArgumentParser.parse_args')
def test_get_last_job_results(self, mock_args, mock_get_builds):
mock_args.return_value = FakeArgs(
zuul_api_url='http://somehost.com/api/tenant/sometenant',
gearman_server='localhost',
checkpoint_file='/tmp/testfile')
args = logscraper.get_arguments()
some_config = logscraper.Config(args, args.zuul_api_url)
job_result = logscraper.get_last_job_results(
'http://somehost.com/api/tenant/tenant1', False, '1234',
'someuuid', None)
self.assertEqual([{'uuid': '1234'}], list(job_result))
some_config.build_cache, None)
self.assertEqual([{'_id': '1234'}], list(job_result))
self.assertEqual(1, mock_get_builds.call_count)
@mock.patch('logscraper.logscraper.get_builds',
@@ -397,7 +406,7 @@ class TestScraper(base.TestCase):
class TestConfig(base.TestCase):
@mock.patch('sys.exit')
def test_save(self, mock_sys):
def test_config_object(self, mock_sys):
# Assume that url is wrong so it raise IndexError
with mock.patch('argparse.ArgumentParser.parse_args') as mock_args:
mock_args.return_value = FakeArgs(
@@ -415,19 +424,20 @@ class TestConfig(base.TestCase):
logscraper.Config(args, args.zuul_api_url)
mock_sys.assert_called()
@mock.patch('logscraper.logscraper.BuildCache.save')
@mock.patch('logscraper.logscraper.BuildCache.clean')
@mock.patch('argparse.ArgumentParser.parse_args')
def test_save(self, mock_args, mock_clean, mock_save):
# correct url without job name
with mock.patch('argparse.ArgumentParser.parse_args') as mock_args:
mock_args.return_value = FakeArgs(
zuul_api_url='http://somehost.com/api/tenant/sometenant',
gearman_server='localhost',
checkpoint_file='/tmp/testfile')
args = logscraper.get_arguments()
with mock.patch('builtins.open',
new_callable=mock.mock_open()
) as mock_file:
some_config = logscraper.Config(args, args.zuul_api_url)
some_config.save('123412312341234')
mock_file.assert_called_with('/tmp/testfile-sometenant', 'w')
mock_args.return_value = FakeArgs(
zuul_api_url='http://somehost.com/api/tenant/sometenant',
gearman_server='localhost',
checkpoint_file='/tmp/testfile')
args = logscraper.get_arguments()
some_config = logscraper.Config(args, args.zuul_api_url)
some_config.save()
mock_clean.assert_called_once()
mock_save.assert_called_once()
class TestLogMatcher(base.TestCase):
@@ -526,3 +536,61 @@ class TestLogMatcher(base.TestCase):
expected_gear_job,
json.loads(mock_gear_job.call_args.args[1].decode('utf-8'))
)
class TestBuildCache(base.TestCase):
    """Unit tests for the sqlite-backed BuildCache."""

    @mock.patch('sqlite3.connect', return_value=mock.MagicMock())
    def test_create_db(self, mock_connect):
        db_path = '/tmp/somefile'
        logscraper.BuildCache(db_path)
        # The constructor must open the DB at the given path and get a cursor.
        mock_connect.assert_called_with(db_path)
        mock_connect.return_value.cursor.assert_called_once()

    @mock.patch('sqlite3.connect')
    def test_create_table(self, mock_connect):
        db_path = '%s/testfile' % tempfile.mkdtemp()
        logscraper.BuildCache(db_path)
        execute_mock = mock_connect.return_value.cursor.return_value.execute
        execute_mock.assert_called()
        # The very first statement issued must be the table creation.
        self.assertEqual('CREATE TABLE IF NOT EXISTS logscraper (uid INTEGER, '
                         'timestamp INTEGER)',
                         execute_mock.call_args_list[0].args[0])

    @mock.patch('sqlite3.connect')
    def test_fetch_data(self, mock_connect):
        db_path = '%s/testfile' % tempfile.mkdtemp()
        logscraper.BuildCache(db_path)
        execute_mock = mock_connect.return_value.cursor.return_value.execute
        execute_mock.assert_called()
        # Third statement (after CREATE TABLE and the cleanup DELETE) is
        # the SELECT that loads the cache contents.
        self.assertEqual('SELECT uid, timestamp FROM logscraper',
                         execute_mock.call_args_list[2].args[0])

    def test_clean(self):
        db_path = '%s/testfile' % tempfile.mkdtemp()
        build_cache = logscraper.BuildCache(db_path)
        # Seed the cache with one fresh and one stale entry.
        fresh = {'ffeeddccbbaa': datetime.datetime.now().timestamp()}
        build_cache.builds['aabbccddeeff'] = 1647131633
        build_cache.builds.update(fresh)
        build_cache.save()
        # Reloading triggers the cleanup; only the fresh entry survives.
        build_cache = logscraper.BuildCache(db_path)
        build_cache.clean()
        self.assertEqual(fresh, build_cache.builds)

    @mock.patch('sqlite3.connect')
    def test_save(self, mock_connect):
        db_path = '%s/testfile' % tempfile.mkdtemp()
        build_cache = logscraper.BuildCache(db_path)
        build_cache.builds = {
            'ffeeddccbbaa': datetime.datetime.now().timestamp()}
        build_cache.save()
        executemany_mock = (
            mock_connect.return_value.cursor.return_value.executemany)
        executemany_mock.assert_called()
        self.assertEqual(('INSERT INTO logscraper VALUES (?,?)',
                          list(build_cache.builds.items())),
                         executemany_mock.call_args_list[0].args)

View File

@@ -28,7 +28,7 @@ buildinfo = """
_id: 17428524
branch: master
build_args:
checkpoint_file: /tmp/results-checkpoint.txt
checkpoint_file: /tmp/results-checkpoint
debug: false
directory: /tmp/logscraper
download: true