system-config/tools/upstream-wheel-audit.py
Ian Wienand d1548e5049
tools/upstream-wheel-audit.py
This is a tool to tell us which of our on-disk wheels are duplicated
upstream by PyPI.  These are things we don't need to cache locally.

At one time, we were downloading all dependencies of our requirements
and caching them; we shouldn't be doing that any more, but anything
reported by this tool can be removed from our local mirrors.

Now that the number of platforms * number of branches is becoming a
maintenance issue, this will help us focus on keeping a useful working
set in the cache.

Change-Id: I3ded6b9869598a0907d7cda9f03bf414e46885df
2023-04-03 14:25:25 +10:00

86 lines
2.9 KiB
Python

# Check which of the wheels in our AFS directory exist upstream
#
# This outputs two files
#
# to-delete.txt : a list of files and directories that can be removed
# from the mirror as all contents are cached in pypi
#
# log.txt : the leading number is the number of files left
# in the given directory after checking upstream
# package contents. i.e. this is unique content in
# our mirror volume.
#
# Needs pypi-simple
import sys
import os
import json
from pypi_simple import PyPISimple, NoSuchProjectError
# Root of the AFS wheel mirror; each platform lives in BASE/<platform>/.
BASE = '/afs/openstack.org/mirror/wheel'

# Output files, created (and truncated) in the current directory as soon
# as the script starts; iterate_wheels() writes to them via print().
FILE_DEL = open('to-delete.txt', 'w')
FILE_LOG = open('log.txt', 'w')

# Platform directories under BASE to audit.
#
# NOTE: every entry needs its own trailing comma.  Python silently
# concatenates adjacent string literals, so a missing comma merges two
# platforms into one non-existent directory name.
PLATFORMS = (
    'centos-8-x86_64',
    'centos-9-x86_64',
    'debian-10-x86_64',
    'debian-11-x86_64',
    'ubuntu-18.04-aarch64',
    'ubuntu-20.04-aarch64',
    'ubuntu-22.04-aarch64',
    'centos-8-aarch64',
    'centos-9-aarch64',
    'debian-10-aarch64',
    'debian-11-aarch64',
    'ubuntu-16.04-x86_64',
    'ubuntu-18.04-x86_64',
    'ubuntu-20.04-x86_64',
    'ubuntu-22.04-x86_64',
)
def iterate_wheels(path, d):
    """Recursively audit a wheel directory against upstream PyPI.

    Walks ``path`` and records what is found in the nested tree ``d``.
    Any named directory that directly contains files is treated as a
    PyPI project called after the directory, and its files are compared
    by filename with what PyPI lists for that project.

    Side effects:

    * each local file that also exists upstream is written to FILE_DEL,
      followed by the directory itself when no unique file remains;
    * ``"<number of unique files> <path>"`` is written to FILE_LOG;
    * when PyPI does not know the project, every local file is written
      to FILE_DEL and a notice is printed to stderr (nothing is logged
      to FILE_LOG for that directory).

    :param path: file or directory to examine.  Pass the top-level
        directory with a trailing slash so that its basename is empty
        and it is not itself looked up on PyPI.
    :param d: tree node of the parent of ``path``, shaped like
        ``{'dirs': {name: node, ...}, 'files': [name, ...]}``.  It is
        updated in place.
    :returns: ``d``, on every code path.
    """
    name = os.path.basename(path)

    # Plain file: just record it against the parent directory.
    if not os.path.isdir(path):
        d['files'].append(name)
        return d

    node = d['dirs'].setdefault(name, {'dirs': {}, 'files': []})
    for entry in os.listdir(path):
        iterate_wheels(os.path.join(path, entry), node)

    # top level has index.html; skip
    # otherwise the directory name is the pypi project name
    local = set(node['files'])
    if not (name and local):
        return d

    with PyPISimple() as client:
        try:
            page = client.get_project_page(name)
        except NoSuchProjectError:
            print("Removing disappeared project : %s" % name, file=sys.stderr)
            for wheel in node['files']:
                print("%s/%s" % (path, wheel), file=FILE_DEL)
            return d

    upstream = {package.filename for package in page.packages}
    not_upstream = local - upstream
    duplicated = local & upstream

    # Print files to delete, and if the directory is empty
    # put that in the list to delete too.  Sorted so that repeated runs
    # produce the same to-delete.txt.  The loop variable must not be
    # called ``d``: that would clobber the tree we return.
    for filename in sorted(duplicated):
        print("%s/%s" % (path, filename), file=FILE_DEL)
    if not not_upstream:
        print("%s" % path, file=FILE_DEL)

    # Output the file left in the directory after pruning
    print("%4d %s" % (len(not_upstream), path), file=FILE_LOG)

    return d
# Audit every platform in turn.  Each one starts from a fresh, empty
# tree.  The trailing slash on the platform root gives it an empty
# basename, so the root itself is never treated as a PyPI project.
for platform in PLATFORMS:
    print("Processing %s" % platform, file=sys.stderr)
    platform_root = '%s/%s/' % (BASE, platform)
    tree = {'dirs': {}, 'files': []}
    iterate_wheels(platform_root, d=tree)