zuul-jobs/roles/upload-logs-base/library/zuul_ibm_upload.py

# Copyright 2014 Rackspace Australia
# Copyright 2018-2019 Red Hat, Inc
# Copyright 2021-2022 Acme Gating, LLC
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

# Make coding more python3-ish
from __future__ import (absolute_import, division, print_function)
__metaclass__ = type

"""
Utility to upload files to IBM Cloud

Run this from the CLI from the zuul-jobs/roles directory with:

  python -m upload-logs-base.library.zuul_ibm_upload
"""
import argparse
import logging
import os
try:
    import queue as queuelib
except ImportError:
    import Queue as queuelib
import sys
import threading

from ibm_botocore.client import Config
import ibm_boto3
import ibm_boto3.s3.transfer

from ansible.module_utils.basic import AnsibleModule

try:
    # Ansible context
    from ansible.module_utils.zuul_jobs.upload_utils import (
        FileList,
        GZIPCompressedStream,
        Indexer,
        retry_function,
    )
except ImportError:
    # Test context
    from ..module_utils.zuul_jobs.upload_utils import (
        FileList,
        GZIPCompressedStream,
        Indexer,
        retry_function,
    )
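
# Cap on the size of the upload thread pool; Uploader.upload() starts at most
# this many worker threads, regardless of how many files are queued.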
MAX_UPLOAD_THREADS = 24


class Uploader():
    def __init__(self, client, bucket, prefix=None, public=True,
                 dry_run=False, endpoint_url=None,
                 bucket_location=None):
        self.dry_run = dry_run
        self.public = public
        if dry_run:
            self.url = 'https://example.com/a/path/'
            return

        self.client = client
        self.prefix = prefix or ''
        self.bucket = bucket
        self.url = os.path.join(endpoint_url,
                                bucket, self.prefix)

        try:
            self._set_cors(bucket)
        except self.client.exceptions.NoSuchBucket:
            if not bucket_location:
                raise Exception("Bucket location must be specified")
            if public:
                acl = 'public-read'
            else:
                acl = 'private'
            self.client.create_bucket(
                ACL=acl,
                Bucket=bucket,
                CreateBucketConfiguration={
                    'LocationConstraint': bucket_location
                }
            )
            self._set_cors(bucket)
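
    # Apply a permissive CORS policy so objects in the bucket can be fetched
    # with cross-origin GET/HEAD requests (e.g. by a browser-based log viewer
    # such as the Zuul dashboard).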
    def _set_cors(self, bucket):
        self.client.put_bucket_cors(
            Bucket=bucket,
            CORSConfiguration={
                'CORSRules': [{
                    'AllowedMethods': [
                        'GET',
                        'HEAD',
                    ],
                    'AllowedOrigins': [
                        '*',
                    ],
                }],
            },
        )

    def upload(self, file_list):
        """Spin up thread pool to upload to storage"""
        if self.dry_run:
            return

        num_threads = min(len(file_list), MAX_UPLOAD_THREADS)
        threads = []
        queue = queuelib.Queue()
        # add items to queue
        for f in file_list:
            queue.put(f)

        for x in range(num_threads):
            t = threading.Thread(target=self.post_thread, args=(queue,))
            threads.append(t)
            t.start()

        for t in threads:
            t.join()
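
    # Worker run by each upload thread: drain the queue, uploading one file
    # at a time and retrying transient failures via retry_function().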
    def post_thread(self, queue):
        while True:
            try:
                file_detail = queue.get_nowait()
                logging.debug("%s: processing job %s",
                              threading.current_thread(),
                              file_detail)
                retry_function(lambda: self._post_file(file_detail))
            except IOError:
                # Do our best to attempt to upload all the files
                logging.exception("Error opening file")
                continue
            except queuelib.Empty:
                # No more work to do
                return

    @staticmethod
    def _is_text_type(mimetype):
        # We want to compress all text types.
        if mimetype.startswith('text/'):
            return True

        # Also compress types that typically contain text but are not a
        # text/* subtype.
        compress_types = [
            'application/json',
            'image/svg+xml',
        ]
        if mimetype in compress_types:
            return True
        return False
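
    # Upload a single file: text-like content is gzip-compressed on the fly
    # via GZIPCompressedStream, everything else is streamed unmodified.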
    def _post_file(self, file_detail):
        relative_path = os.path.join(self.prefix, file_detail.relative_path)
        content_encoding = None

        if file_detail.folder:
            # We don't need to upload folders to IBM
            return

        if (file_detail.encoding is None and
                self._is_text_type(file_detail.mimetype)):
            content_encoding = 'gzip'
            data = GZIPCompressedStream(open(file_detail.full_path, 'rb'))
        else:
            if (not file_detail.filename.endswith(".gz") and
                    file_detail.encoding):
                # Don't apply gzip content encoding to files that we receive
                # already gzipped. The reason for this is that storage would
                # serve such a file back decompressed to clients whose
                # Accept-Encoding does not include gzip, which causes
                # problems when the desired file state is compressed, as
                # with .tar.gz tarballs.
                content_encoding = file_detail.encoding
            data = open(file_detail.full_path, 'rb')

        extra_args = dict(
            ContentType=file_detail.mimetype,
        )
        if content_encoding:
            extra_args['ContentEncoding'] = content_encoding
        if self.public:
            extra_args['ACL'] = 'public-read'

        self.client.upload_fileobj(
            data,
            self.bucket,
            relative_path,
            ExtraArgs=extra_args
        )


def run(bucket, files,
        indexes=True, parent_links=True, topdir_parent_link=False,
        partition=False, footer='index_footer.html',
        prefix=None, public=True, dry_run=False, api_key=None,
        instance_id=None, endpoint_url=None, bucket_location=None):
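    """Upload a list of files to an IBM Cloud Object Storage bucket.

    This is shared by the Ansible and CLI entry points below; it returns the
    base URL under which the uploaded files can be found.
    """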
    client = ibm_boto3.client(
        "s3",
        ibm_api_key_id=api_key,
        ibm_service_instance_id=instance_id,
        config=Config(signature_version="oauth"),
        endpoint_url=endpoint_url,
    )

    if prefix:
        prefix = prefix.lstrip('/')
    if partition and prefix:
        parts = prefix.split('/')
        if len(parts) > 1:
            bucket += '_' + parts[0]
            prefix = '/'.join(parts[1:])

    # Create the objects to make sure the arguments are sound.
    with FileList() as file_list:
        # Scan the files.
        for file_path in files:
            file_list.add(file_path)

        indexer = Indexer(file_list)

        # (Possibly) make indexes.
        if indexes:
            indexer.make_indexes(create_parent_links=parent_links,
                                 create_topdir_parent_link=topdir_parent_link,
                                 append_footer=footer)

        logging.debug("List of files prepared to upload:")
        for x in file_list:
            logging.debug(x)

        # Upload.
        uploader = Uploader(client, bucket, prefix, public, dry_run,
                            endpoint_url, bucket_location)
        uploader.upload(file_list)

        return uploader.url


def ansible_main():
    module = AnsibleModule(
        argument_spec=dict(
            bucket=dict(required=True, type='str'),
            files=dict(required=True, type='list'),
            partition=dict(type='bool', default=False),
            indexes=dict(type='bool', default=True),
            parent_links=dict(type='bool', default=True),
            topdir_parent_link=dict(type='bool', default=False),
            public=dict(type='bool', default=True),
            footer=dict(type='str'),
            prefix=dict(type='str'),
            api_key=dict(type='str'),
            instance_id=dict(type='str'),
            endpoint_url=dict(type='str'),
            bucket_location=dict(type='str'),
        )
    )

    p = module.params
    url = run(p.get('bucket'), p.get('files'),
              indexes=p.get('indexes'),
              parent_links=p.get('parent_links'),
              topdir_parent_link=p.get('topdir_parent_link'),
              partition=p.get('partition'),
              footer=p.get('footer'),
              prefix=p.get('prefix'),
              public=p.get('public'),
              api_key=p.get('api_key'),
              instance_id=p.get('instance_id'),
              endpoint_url=p.get('endpoint_url'),
              bucket_location=p.get('bucket_location'))
    module.exit_json(changed=True,
                     url=url)
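

# A sketch of how the module might be invoked from an Ansible task (values
# are illustrative placeholders; in zuul-jobs the surrounding role supplies
# them from job variables and secrets):
#
#   - name: Upload logs to IBM Cloud Object Storage
#     zuul_ibm_upload:
#       bucket: example-log-bucket
#       files:
#         - "{{ example_log_root }}/"
#       api_key: "{{ example_ibm_api_key }}"
#       instance_id: "{{ example_ibm_instance_id }}"
#       endpoint_url: "https://s3.us-south.cloud-object-storage.appdomain.cloud"
#       bucket_location: "us-south-standard"
#       prefix: "example/prefix"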


def cli_main():
    parser = argparse.ArgumentParser(
        description="Upload files to IBM Cloud Storage"
    )
    parser.add_argument('--verbose', action='store_true',
                        help='show debug information')
    parser.add_argument('--no-indexes', action='store_true',
                        help='do not generate any indexes at all')
    parser.add_argument('--no-parent-links', action='store_true',
                        help='do not include links back to a parent dir')
    parser.add_argument('--create-topdir-parent-link', action='store_true',
                        help='include a link in the root directory of the '
                        'files to the parent directory which may be the '
                        'index of all results')
    parser.add_argument('--no-public', action='store_true',
                        help='do not create the bucket as public')
    parser.add_argument('--partition', action='store_true',
                        help='partition the prefix into multiple buckets')
    parser.add_argument('--append-footer', default='index_footer.html',
                        help='when generating an index, if the given file is '
                        'present in a directory, append it to the index '
                        '(set to "none" to disable)')
    parser.add_argument('--prefix',
                        help='Prepend this path to the object names when '
                        'uploading')
    parser.add_argument('--dry-run', action='store_true',
                        help='do not attempt to create buckets or upload, '
                        'useful with --verbose for debugging')
    parser.add_argument('--api-key',
                        help='An IBM Cloud API key')
    parser.add_argument('--instance-id',
                        help='An IBM Cloud Object Storage instance ID')
    parser.add_argument('--endpoint-url',
                        help='An IBM Cloud Object Storage endpoint URL')
    parser.add_argument('--bucket-location',
                        help='The location constraint for the bucket')
    parser.add_argument('bucket',
                        help='Name of the bucket to use when uploading')
    parser.add_argument('files', nargs='+',
                        help='the file(s) to upload with recursive glob '
                        'matching when supplied as a string')
    args = parser.parse_args()

    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
        logging.captureWarnings(True)

    append_footer = args.append_footer
    if append_footer.lower() == 'none':
        append_footer = None

    url = run(args.bucket, args.files,
              indexes=not args.no_indexes,
              parent_links=not args.no_parent_links,
              topdir_parent_link=args.create_topdir_parent_link,
              partition=args.partition,
              footer=append_footer,
              prefix=args.prefix,
              public=not args.no_public,
              dry_run=args.dry_run,
              api_key=args.api_key,
              instance_id=args.instance_id,
              endpoint_url=args.endpoint_url,
              bucket_location=args.bucket_location)
    print(url)


if __name__ == '__main__':
    # The zip/ansible/modules check is required for Ansible 5 because
    # stdin may be a tty, but it does not work in Ansible 2.8. The tty
    # check works on versions 2.8, 2.9, and 6.
    if ('.zip/ansible/modules' in sys.argv[0] or not sys.stdin.isatty()):
        ansible_main()
    else:
        cli_main()