zuul-jobs/roles/upload-logs-base/library/zuul_ibm_upload.py

#!/usr/bin/env python3
#
# Copyright 2014 Rackspace Australia
# Copyright 2018-2019 Red Hat, Inc
# Copyright 2021-2022 Acme Gating, LLC
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

# Make coding more python3-ish
from __future__ import (absolute_import, division, print_function)
__metaclass__ = type


"""
Utility to upload files to IBM Cloud

Run this from the CLI from the zuul-jobs/roles directory with:

  python -m upload-logs-base.library.zuul_ibm_upload
"""

import argparse
import logging
import os
try:
    import queue as queuelib
except ImportError:
    import Queue as queuelib
import sys
import threading

from ibm_botocore.client import Config
import ibm_boto3
import ibm_boto3.s3.transfer

from ansible.module_utils.basic import AnsibleModule

try:
    # Ansible context
    from ansible.module_utils.zuul_jobs.upload_utils import (
        FileList,
        GZIPCompressedStream,
        Indexer,
        retry_function,
    )
except ImportError:
    # Test context
    from ..module_utils.zuul_jobs.upload_utils import (
        FileList,
        GZIPCompressedStream,
        Indexer,
        retry_function,
    )

# Upper bound on the number of worker threads Uploader.upload() starts; fewer
# are used when there are fewer files than this.
MAX_UPLOAD_THREADS = 24


class Uploader():
    """Upload a list of files to a bucket through an S3-style client.

    In dry-run mode no client calls are made: construction skips bucket
    setup and upload() returns immediately.
    """

    def __init__(self, client, bucket, prefix=None, public=True,
                 dry_run=False, endpoint_url=None,
                 bucket_location=None):
        """Prepare the target bucket.

        Unless dry_run is set, a CORS policy is applied to the bucket. If
        the client reports NoSuchBucket, the bucket is created first
        (public-read or private depending on ``public``) and the CORS
        policy is applied afterwards.

        :param client: client object used for all bucket/object calls
        :param bucket: name of the bucket to upload into
        :param prefix: path prepended to every object name (default '')
        :param public: when true, use the 'public-read' ACL
        :param dry_run: when true, do not talk to the service at all
        :param endpoint_url: base URL used to build ``self.url``; needed
            unless dry_run is set
        :param bucket_location: location constraint used if the bucket
            has to be created
        :raises Exception: if the bucket is missing and no
            bucket_location was given
        """
        self.dry_run = dry_run
        self.public = public
        # Assign these before the dry-run early return so that every
        # instance carries the same set of attributes.
        self.client = client
        self.prefix = prefix or ''
        self.bucket = bucket

        if dry_run:
            self.url = 'https://example.com/a/path/'
            return

        self.url = os.path.join(endpoint_url,
                                bucket, self.prefix)

        try:
            self._set_cors(bucket)
        except self.client.exceptions.NoSuchBucket:
            if not bucket_location:
                raise Exception("Bucket location must be specified")
            if public:
                acl = 'public-read'
            else:
                acl = 'private'
            self.client.create_bucket(
                ACL=acl,
                Bucket=bucket,
                CreateBucketConfiguration={
                    'LocationConstraint': bucket_location
                }
            )
            self._set_cors(bucket)

    def _set_cors(self, bucket):
        """Allow GET and HEAD requests from any origin on ``bucket``."""
        self.client.put_bucket_cors(
            Bucket=bucket,
            CORSConfiguration={
                'CORSRules': [{
                    'AllowedMethods': [
                        'GET',
                        'HEAD',
                    ],
                    'AllowedOrigins': [
                        '*',
                    ],
                }],
            },
        )

    def upload(self, file_list):
        """Spin up thread pool to upload to storage

        Every entry of ``file_list`` is queued and processed by up to
        MAX_UPLOAD_THREADS worker threads; the call returns once all
        workers have finished. Does nothing in dry-run mode.
        """

        if self.dry_run:
            return

        num_threads = min(len(file_list), MAX_UPLOAD_THREADS)
        threads = []
        queue = queuelib.Queue()
        # add items to queue
        for f in file_list:
            queue.put(f)

        for _ in range(num_threads):
            t = threading.Thread(target=self.post_thread, args=(queue,))
            threads.append(t)
            t.start()

        for t in threads:
            t.join()

    def post_thread(self, queue):
        """Worker loop: upload queued files until the queue is empty.

        An IOError for one file is logged and the worker moves on to the
        next file. Other exceptions are not handled here.
        """
        while True:
            try:
                file_detail = queue.get_nowait()
            except queuelib.Empty:
                # No more work to do
                return

            logging.debug("%s: processing job %s",
                          threading.current_thread(),
                          file_detail)
            try:
                retry_function(lambda: self._post_file(file_detail))
            except IOError:
                # Do our best to attempt to upload all the files
                logging.exception("Error opening file")

    @staticmethod
    def _is_text_type(mimetype):
        """Return True if content of this mimetype should be gzipped."""
        # We want to compress all text types.
        if mimetype.startswith('text/'):
            return True

        # Further compress types that typically contain text but are no
        # text sub type.
        compress_types = [
            'application/json',
            'image/svg+xml',
        ]
        if mimetype in compress_types:
            return True
        return False

    def _post_file(self, file_detail):
        """Upload a single file described by ``file_detail``.

        Folders are skipped. Text-like files that carry no encoding are
        gzip-compressed on the fly and uploaded with a gzip
        Content-Encoding; everything else is uploaded as-is.
        """
        if file_detail.folder:
            # We don't need to upload folders to IBM
            return

        relative_path = os.path.join(self.prefix, file_detail.relative_path)
        content_encoding = None

        # Keep ownership of the raw handle in a ``with`` block so it is
        # closed on every path, including a failed upload that is about to
        # be retried. The gzip wrapper is not relied upon to close it.
        with open(file_detail.full_path, 'rb') as source:
            if (file_detail.encoding is None and
                    self._is_text_type(file_detail.mimetype)):
                content_encoding = 'gzip'
                data = GZIPCompressedStream(source)
            else:
                if (not file_detail.filename.endswith(".gz") and
                        file_detail.encoding):
                    # Don't apply gzip encoding to files that we receive
                    # as already gzipped. The reason for this is storage
                    # will serve this back to users as an uncompressed
                    # file if they don't set an accept-encoding that
                    # includes gzip. This can cause problems when the
                    # desired file state is compressed as with .tar.gz
                    # tarballs.
                    content_encoding = file_detail.encoding
                data = source

            extra_args = dict(
                ContentType=file_detail.mimetype,
            )
            if content_encoding:
                extra_args['ContentEncoding'] = content_encoding

            if self.public:
                extra_args['ACL'] = 'public-read'

            self.client.upload_fileobj(
                data,
                self.bucket,
                relative_path,
                ExtraArgs=extra_args
            )


def run(bucket, files,
        indexes=True, parent_links=True, topdir_parent_link=False,
        partition=False, footer='index_footer.html',
        prefix=None, public=True, dry_run=False, api_key=None,
        instance_id=None, endpoint_url=None, bucket_location=None):
    """Collect ``files``, optionally index them, and upload to ``bucket``.

    Leading slashes are stripped from ``prefix``. When ``partition`` is
    set and the prefix has more than one path component, the first
    component is appended to the bucket name (joined with '_') and
    dropped from the prefix.

    :returns: the ``url`` attribute of the Uploader that was used
    """
    cos_client = ibm_boto3.client(
        "s3",
        ibm_api_key_id=api_key,
        ibm_service_instance_id=instance_id,
        config=Config(signature_version="oauth"),
        endpoint_url=endpoint_url,
    )

    if prefix:
        prefix = prefix.lstrip('/')
    if partition and prefix:
        # Only split when there is something after the first component.
        first_part, slash, remainder = prefix.partition('/')
        if slash:
            bucket = bucket + '_' + first_part
            prefix = remainder

    # Build everything up front so bad arguments surface before uploading.
    with FileList() as collected:
        for path in files:
            collected.add(path)

        index_builder = Indexer(collected)
        if indexes:
            index_builder.make_indexes(
                create_parent_links=parent_links,
                create_topdir_parent_link=topdir_parent_link,
                append_footer=footer)

        logging.debug("List of files prepared to upload:")
        for entry in collected:
            logging.debug(entry)

        target = Uploader(cos_client, bucket, prefix, public, dry_run,
                          endpoint_url, bucket_location)
        target.upload(collected)
        return target.url


def ansible_main():
    """Entry point when executed as an Ansible module.

    Declares the module arguments, forwards them to run() and reports
    the resulting URL through exit_json. The module always reports
    changed=True.
    """
    module = AnsibleModule(
        argument_spec=dict(
            bucket=dict(required=True, type='str'),
            files=dict(required=True, type='list'),
            partition=dict(type='bool', default=False),
            indexes=dict(type='bool', default=True),
            parent_links=dict(type='bool', default=True),
            topdir_parent_link=dict(type='bool', default=False),
            public=dict(type='bool', default=True),
            footer=dict(type='str'),
            prefix=dict(type='str'),
            # The API key is a credential: no_log keeps Ansible from
            # recording its value in logs and in the module's output.
            api_key=dict(type='str', no_log=True),
            instance_id=dict(type='str'),
            endpoint_url=dict(type='str'),
            bucket_location=dict(type='str'),
        )
    )

    p = module.params
    url = run(p.get('bucket'), p.get('files'),
              indexes=p.get('indexes'),
              parent_links=p.get('parent_links'),
              topdir_parent_link=p.get('topdir_parent_link'),
              partition=p.get('partition'),
              footer=p.get('footer'),
              prefix=p.get('prefix'),
              public=p.get('public'),
              api_key=p.get('api_key'),
              instance_id=p.get('instance_id'),
              endpoint_url=p.get('endpoint_url'),
              bucket_location=p.get('bucket_location'))
    module.exit_json(changed=True,
                     url=url)


def cli_main():
    """Entry point for command-line use.

    Parses the command line, optionally enables debug logging, calls
    run() with the parsed options and prints the URL it returns.
    """
    switch = {'action': 'store_true'}
    # Declared in the order they are registered with the parser.
    arg_table = (
        ('--verbose', dict(switch, help='show debug information')),
        ('--no-indexes',
         dict(switch, help='do not generate any indexes at all')),
        ('--no-parent-links',
         dict(switch, help='do not include links back to a parent dir')),
        ('--create-topdir-parent-link',
         dict(switch,
              help='include a link in the root directory of the '
                   'files to the parent directory which may be the '
                   'index of all results')),
        ('--no-public',
         dict(switch, help='do not create the bucket as public')),
        ('--partition',
         dict(switch, help='partition the prefix into multiple buckets')),
        ('--append-footer',
         dict(default='index_footer.html',
              help='when generating an index, if the given file is '
                   'present in a directory, append it to the index '
                   '(set to "none" to disable)')),
        ('--prefix',
         dict(help='Prepend this path to the object names when '
                   'uploading')),
        ('--dry-run',
         dict(switch,
              help='do not attempt to create buckets or upload, '
                   'useful with --verbose for debugging')),
        ('--api-key', dict(help='An IBM Cloud API key')),
        ('--instance-id',
         dict(help='An IBM Cloud Object Storage instance ID')),
        ('--endpoint-url',
         dict(help='An IBM Cloud Object Storage endpoint URL')),
        ('--bucket-location',
         dict(help='The location constraint for the bucket')),
        ('bucket', dict(help='Name of the bucket to use when uploading')),
        ('files',
         dict(nargs='+',
              help='the file(s) to upload with recursive glob '
                   'matching when supplied as a string')),
    )

    cli = argparse.ArgumentParser(
        description="Upload files to IBM Cloud Storage"
    )
    for flag, settings in arg_table:
        cli.add_argument(flag, **settings)
    opts = cli.parse_args()

    if opts.verbose:
        logging.basicConfig(level=logging.DEBUG)
        logging.captureWarnings(True)

    # The literal word "none" (any case) switches the footer off.
    footer = opts.append_footer
    if footer.lower() == 'none':
        footer = None

    run_options = dict(
        indexes=not opts.no_indexes,
        parent_links=not opts.no_parent_links,
        topdir_parent_link=opts.create_topdir_parent_link,
        partition=opts.partition,
        footer=footer,
        prefix=opts.prefix,
        public=not opts.no_public,
        dry_run=opts.dry_run,
        api_key=opts.api_key,
        instance_id=opts.instance_id,
        endpoint_url=opts.endpoint_url,
        bucket_location=opts.bucket_location,
    )
    print(run(opts.bucket, opts.files, **run_options))


if __name__ == '__main__':
    # A terminal on stdin is treated as interactive command-line use;
    # anything else is handled as an Ansible module invocation.
    if not sys.stdin.isatty():
        ansible_main()
    else:
        cli_main()