copy-wheels : generate indexes version 2

Some things have become evident when generating the indexes that require some larger changes. Firstly, the indexer script needs python3 on the host. Since we're still building CentOS 7 wheels, we need to install Python 3 from EPEL there. Secondly, because part of the PEP503 index page is the file hash, reading all the files back over AFS is quite slow. It's also quite slow having ansible loop a task each time, which all adds up to job timeouts. Instead, make the indexes on the local disk before we copy the results to AFS. This requires copying both scripts to the host for execution (rather than relying on "script:") so the wheel-copy.sh script can call wheel-indexer.py. While we are there, do a small refactor of wheel-indexer.py to use os.walk() (which makes it easier to have this as a stand-alone recursive script later, if something changes). Also update the output to use <ul><li> for the filenames, so it looks a little better in the output html. Change-Id: I85f9e132bc55fd8d33583a698e15c47665e5cf8d
2020-01-15 07:43:42 +11:00 · 2020-01-15 07:43:42 +11:00 · 3e9efa3a65
parent dc3289235e
commit 3e9efa3a65
5 changed files with 95 additions and 60 deletions
--- a/roles/copy-wheels/files/wheel-copy.sh
+++ b/roles/copy-wheels/files/wheel-copy.sh
@ -22,6 +22,10 @@ for f in $WHEELHOUSE_DIR/*; do

    DEST_DIR="${PACKAGENAME:0:1}/$PACKAGENAME"

+    # Create the index file
+    # NOTE(ianw) : remove temporary "--output" when working
+    /usr/local/bin/wheel-indexer.py --debug --output "index.html.tmp" $f
+
    # Create the mirror directories in AFS /s/split style. This
    # depends on the existence of a mod_rewrite script which unmunges
    # the path, and is required because AFS has a practical folder size
--- a/roles/copy-wheels/files/wheel-indexer.py
+++ b/roles/copy-wheels/files/wheel-indexer.py
@ -16,22 +16,24 @@
 # under the License.
 #

-# glob all .whl files in a directory, and make a index.html page
+# Find all .whl files in a directory, and make an index.html page
 # in PEP503 (https://www.python.org/dev/peps/pep-0503/) format

 import argparse
 import datetime
 import email
-import glob
 import hashlib
 import html
 import logging
+import os
 import sys
 import zipfile

 parser = argparse.ArgumentParser()
-parser.add_argument('outfile', nargs='?', default='-', help="output filename")
+parser.add_argument('toplevel', help="directory to index")
 parser.add_argument('-d', '--debug', dest="debug", action='store_true')
+parser.add_argument('-o', '--output', dest="output",
+                    default='index.html', help="Output filename, - for stdout")
 args = parser.parse_args()

 level = logging.DEBUG if args.debug else logging.INFO
@ -92,56 +94,74 @@ def get_sha256(filename):
    return(sha256.hexdigest())


-output = '''<html>
+def create_index(path, files):
+
+    project = os.path.basename(path)
+
+    output = f'''<html>
  <head>
-    <title>Links</title>
+    <title>{project}</title>
  </head>
  <body>
+   <ul>
 '''

-files = glob.glob('*.whl')
-for f in files:
+    for f in files:
+        f_full = os.path.join(path, f)
+        requirements = ''
+        try:
+            logging.debug("Checking for requirements of : %s" % f_full)
+            requirements = get_requirements(f_full)
+            logging.debug("requirements are: %s" % requirements)
+        # NOTE(ianw): i'm not really sure if any of these should be
+        # terminal, as it would mean pip can't read the file anyway.  Just
+        # log for now.
+        except NoMetadataException:
+            logging.debug("no metadata")
+            pass
+        except NoRequirementsException:
+            logging.debug("no python requirements")
+            pass
+        except BadFormatException:
+            logging.debug("Could not open")
+            pass

-    requirements = ''
-    try:
-        logging.debug("Checking for requirements of : %s" % f)
-        requirements = get_requirements(f)
-        logging.debug("requirements are: %s" % requirements)
-    # NOTE(ianw): i'm not really sure if any of these should be
-    # terminal, as it would mean pip can't read the file anyway.  Just
-    # log for now.
-    except NoMetadataException:
-        logging.debug("no metadata")
-        pass
-    except NoRequirementsException:
-        logging.debug("no python requirements")
-        pass
-    except BadFormatException:
-        logging.debug("Could not open")
-        pass
+        sha256 = get_sha256(f_full)
+        logging.debug("sha256 for %s: %s" % (f_full, sha256))

-    sha256 = get_sha256(f)
-    logging.debug("sha256 for %s: %s" % (f, sha256))
+        output += f'      <li><a href="{f}#sha256={sha256}"'
+        if requirements:
+            output += f' data-requires-python="{requirements}" '
+        output += f'>{f}</a></li>\n'

-    output += f'    <a href="{f}#sha256={sha256}"'
-    if requirements:
-        output += f' data-requires-python="{requirements}" '
-    output += f'>{f}</a>\n'
-
-output += '''  </body>
+    output += '''   </ul>
+   </body>
 </html>
 '''
-now = datetime.datetime.now()
-output += '<!-- last update: %s -->\n' % now.isoformat()
+    now = datetime.datetime.now()
+    output += '<!-- last update: %s -->\n' % now.isoformat()

-logging.debug("Final output write")
+    return output

-if args.outfile == '-':
-    outfile = sys.stdout
-else:
-    outfile = open(args.outfile, "w")
-    logging.debug("Output going to: %s" % args.outfile)

-outfile.write(output)
+for root, dirs, files in os.walk(args.toplevel):
+    # sanity check we are only called from leaf directories by the
+    # driver script
+    if dirs:
+        print("This should only be called from leaf directories")
+        sys.exit(1)

-logging.debug("Done!")
+    logging.debug("Processing %s" % root)
+
+    output = create_index(root, files)
+
+    logging.debug("Final output write")
+    if args.output == '-':
+        out_file = sys.stdout
+    else:
+        out_path = os.path.join(root, args.output)
+        logging.debug("Writing index file: %s" % out_path)
+        out_file = open(out_path, "w")
+
+    out_file.write(output)
+    logging.debug("Done!")
--- a/roles/copy-wheels/tasks/main.yaml
+++ b/roles/copy-wheels/tasks/main.yaml
@ -1,5 +1,24 @@
+- name: Ensure we have python3 for indexer
+  include_tasks: "{{ lookup('first_found', params) }}"
+  vars:
+    params:
+      files:
+        - "py3.{{ ansible_distribution }}.{{ ansible_distribution_major_version }}.yaml"
+        - "py3.default.yaml"  # fallback; must match the tasks/py3.default.yaml file name
+
+- name: Put copy scripts on host
+  copy:
+    src: '{{ item }}'
+    dest: '/usr/local/bin/{{ item }}'
+    owner: root
+    group: root
+    mode: '0755'
+  loop:
+    - wheel-copy.sh
+    - wheel-indexer.py
+
 - name: Copy the wheels to AFS
-  script: wheel-copy.sh {{ wheel_dir }} {{ afs_dir }}
+  command: '/usr/local/bin/wheel-copy.sh {{ wheel_dir }} {{ afs_dir }}'

 - name: Rebuild top-level mirror index
  script: wheel-index.sh {{ afs_dir }}
@ -7,20 +26,3 @@
  # the final index.  All hosts should be finished copying under
  # linear strategy.
  run_once: True
-
- name: Get project directories
-  # the directories are laid out a/ b/ c/ ... z/ with projects
-  # underneath (ergo */* match).  We actually use mod_rewrite to paper
-  # over this in the mirror apache config for external users.
-  shell: 'ls -d {{ afs_dir }}/*/*'
-  register: directories
-
- name: Create individual project indexes
-  # NOTE(ianw) .test to be removed after testing
-  script: wheel-indexer.py --debug index.html.test
-  args:
-    chdir: '{{ item }}'
-    executable: 'python3'
-  loop: "{{ directories.stdout.split('\n') }}"
-  # NOTE(ianw) remove after testing
-  ignore_errors: true
--- a/roles/copy-wheels/tasks/py3.CentOS.7.yaml
+++ b/roles/copy-wheels/tasks/py3.CentOS.7.yaml
@ -0,0 +1,5 @@
+- name: Install Python3
+  yum:
+    package: python3
+    enablerepo: epel
+    state: present
--- a/roles/copy-wheels/tasks/py3.default.yaml
+++ b/roles/copy-wheels/tasks/py3.default.yaml
@ -0,0 +1,4 @@
+- name: Install Python3
+  package:
+    name: python3
+    state: present