# Check which of the wheels in our AFS directory exist upstream
#
# This outputs two files
#
#  to-delete.txt : a list of files and directories that can be removed
#                  from the mirror as all contents are cached in pypi
#
#  log.txt       : the leading number is the number of files left
#                  in the given directory after checking upstream
#                  package contents.  i.e. this is unique content in
#                  our mirror volume.
#
# Needs pypi-simple

import sys
import os
import json

from pypi_simple import PyPISimple, NoSuchProjectError

BASE = '/afs/openstack.org/mirror/wheel'

# Report files written by iterate_wheels(); closed by the driver loop at
# the bottom of the script.
FILE_DEL = open('to-delete.txt', 'w')
FILE_LOG = open('log.txt', 'w')

# NOTE: every entry needs its trailing comma -- adjacent string literals
# are silently concatenated into a single (non-existent) platform name.
PLATFORMS = (
    'centos-8-x86_64',
    'centos-9-x86_64',
    'debian-10-x86_64',
    'debian-11-x86_64',
    'ubuntu-18.04-aarch64',
    'ubuntu-20.04-aarch64',
    'ubuntu-22.04-aarch64',
    'centos-8-aarch64',
    'centos-9-aarch64',
    'debian-10-aarch64',
    'debian-11-aarch64',
    'ubuntu-16.04-x86_64',
    'ubuntu-18.04-x86_64',
    'ubuntu-20.04-x86_64',
    'ubuntu-22.04-x86_64',
)


def iterate_wheels(path, d):
    """Recursively record *path* in the tree *d* and audit it against PyPI.

    A non-directory *path* has its basename appended to ``d['files']``.
    A directory gets a ``{'dirs': {}, 'files': []}`` node under
    ``d['dirs'][basename]`` and every entry inside it is visited
    recursively.  When the directory has a non-empty name and directly
    contains files, its name is looked up as a PyPI project:

    * files that also exist upstream are written to ``FILE_DEL``;
    * if nothing unique remains, the directory itself is written to
      ``FILE_DEL`` too;
    * the number of unique (not-upstream) files and the directory path
      are written to ``FILE_LOG``;
    * if the project does not exist upstream at all, every local file is
      written to ``FILE_DEL`` and nothing is logged.

    :param path: filesystem path to examine.
    :param d: tree node of the form ``{'dirs': {...}, 'files': [...]}``;
        it is updated in place.
    :returns: the same *d* that was passed in.
    """
    name = os.path.basename(path)

    if not os.path.isdir(path):
        d['files'].append(name)
        return d

    node = d['dirs'].setdefault(name, {'dirs': {}, 'files': []})
    for entry in os.listdir(path):
        iterate_wheels(os.path.join(path, entry), node)

    # top level has index.html; skip
    # otherwise the directory name is the pypi project name
    if not name or not node['files']:
        return d

    with PyPISimple() as client:
        try:
            page = client.get_project_page(name)
        except NoSuchProjectError:
            print("Removing disappeared project : %s" % name,
                  file=sys.stderr)
            for wheel in node['files']:
                print("%s/%s" % (path, wheel), file=FILE_DEL)
            return d

    upstream = {package.filename for package in page.packages}
    local = set(node['files'])

    not_upstream = local - upstream
    dups = local & upstream

    # Print files to delete, and if the directory is empty
    # put that in the list to delete too.  Sorted so that the report is
    # stable from run to run.
    for wheel in sorted(dups):
        print("%s/%s" % (path, wheel), file=FILE_DEL)
    if not not_upstream:
        print("%s" % path, file=FILE_DEL)

    # Output the file left in the directory after pruning
    print("%4d %s" % (len(not_upstream), path), file=FILE_LOG)
    return d


# Make sure both report files are flushed and closed once every platform
# has been processed (or if processing aborts with an exception).
with FILE_DEL, FILE_LOG:
    for p in PLATFORMS:
        print("Processing %s" % p, file=sys.stderr)
        # The trailing slash makes os.path.basename() return '' for the
        # platform directory, which is how the top level gets skipped.
        iterate_wheels('%s/%s/' % (BASE, p),
                       d={'dirs': {}, 'files': []})