Merge "Add a script to list change owner statistics"

2016-08-30 16:02:32 +00:00
parent 44fcd8e8f0 791769fbba
commit 71016bb569
1 changed files with 641 additions and 0 deletions
--- a/tools/owners.py
+++ b/tools/owners.py
@@ -0,0 +1,641 @@
+#!/usr/bin/env python
+
+# Copyright (c) 2016 OpenStack Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS
+# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+# Description: When run using OpenStack's Gerrit server, this builds
+# YAML representations of aggregate change owner details and change
+# counts for each governance project-team, as well as a combined set
+# for all teams.
+
+# Rationale: The OpenStack Technical Committee and Project Team Lead
+# elections need electorate rolls taken from "Active Technical
+# Contributors" to any repos under official project-teams over a
+# particular timeframe. Similarly, the OpenStack Foundation gives
+# summit registration discount codes to contributors meeting similar
+# criteria. The Gerrit REST API provides access to all the data
+# necessary to identify these individuals.
+
+# Use: The results end up in files named for each
+# official governance project-team (or "all") ending with a .yaml
+# extension. At the time of writing, it takes approximately 30
+# minutes to run on a well-connected machine with 70-80ms round-trip
+# latency to review.openstack.org.
+
+# An example for generating the March 2016 technical election rolls:
+#
+#     $ virtualenv venv
+#     [...]
+#     $ ./venv/bin/pip install pyyaml requests
+#     [...]
+#     $ ./venv/bin/python tools/owners.py -a 2015-03-04 \
+#           -b 2016-03-04 -i 11131 -o owners -r march-2016-elections
+#     MISSING: ansible-build-image
+#     MERGING DUPLICATE ACCOUNT: 8074 into 2467
+#     [...blah, blah, blah...wait for completion...]
+#
+# TODO(fungi): Add a pass which will correctly generate the
+# stable_branch_maintenance.* files. In the meantime, to properly
+# generate the SBM PTL electorate, run a second time with a
+# different -o of sbm, adding the -n and -s options, and then copy
+# the full electorate over like:
+#
+#     $ ./venv/bin/python tools/owners.py -a 2015-03-04 \
+#           -b 2016-03-04 -i 11131 -o sbm -r march-2016-elections \
+#           -n -s 'branch:^stable/.*'
+#     [...wait for completion again...]
+#     $ cp sbm/_electorate.txt owners/stable_branch_maintenance.txt
+#     $ cp sbm/_all_owners.yaml owners/stable_branch_maintenance.yaml
+#
+# Once complete, make a compressed tarball of the owners directory
+# and send it attached to a PGP/MIME signed message to the appointed
+# election officials. The various *.txt files are lists of the
+# preferred addresses of all valid voters for the various PTL
+# elections (whose team names correspond to the file names),
+# suitable for passing directly to CIVS. The similarly named *.yaml
+# files are detailed structured data about the same sets of voters,
+# for use in validating the address lists. The _electorate.txt file
+# is the equivalent address list for the TC election voters, and its
+# corresponding structured data is in _all_owners.yaml.
+
+# You can also do interesting analysis on _all_owners.yaml, for
+# example:
+#
+#     $ ./venv/bin/python
+#     >>> import yaml
+#     >>>
+#     >>> o = yaml.load(open('owners/_all_owners.yaml'))
+#     >>> for c in range(5):
+#     ...     print('Owners of at least %s changes: %s' % (
+#     ...         c+1,
+#     ...         len({k: v for k, v in o.iteritems() if v['count'] > c})))
+#     ...
+#     Owners of at least 1 changes: 3239
+#     Owners of at least 2 changes: 2352
+#     Owners of at least 3 changes: 1924
+#     Owners of at least 4 changes: 1682
+#     Owners of at least 5 changes: 1504
+
+
+from __future__ import print_function
+import argparse
+import csv
+import datetime
+import json
+import os
+import string
+import sys
+
+import requests
+import yaml
+
+
+def dumper(data, stream):
+    """Convenience wrapper to consistently set YAML formatting"""
+    return yaml.safe_dump(data, allow_unicode=True, default_flow_style=False,
+                          encoding='utf-8', stream=stream)
+
+
+def normalize_email(email):
+    """Lower-case the domain part of E-mail addresses to better spot
+    duplicate entries, since the domain part is case-insensitive
+    courtesy of DNS while the local part is not necessarily"""
+    local, domain = email.split('@')
+    domain = domain.lower()
+    return '%s@%s' % (local, domain)
+
+
+def normalize_project(project):
+    """Replace spaces and hyphens with underscores in project teams
+    and then lower-case them, for more convenient filenames"""
+    return project.translate(string.maketrans(' -', '__')).lower()
+
+
+def date_merged(change, after=None, before=None):
+    """Determine the date and time a specific change merged"""
+
+    date = change['submitted']
+
+    if not date:
+        # Something's terribly wrong with any changes matching this now
+        print(
+            'SKIPPING DATELESS MERGE: change %s for account %s'
+            % (change['_number'], change['owner']['_account_id']),
+            file=sys.stderr)
+        return None
+
+    # Strip superfluous subsecond values as Gerrit always just
+    # reports .000000000 for them anyway
+    date = date.split('.')[0]
+
+    # Pass back an invalid result if it falls after the requested
+    # cutoff
+    if before and date >= before:
+        return None
+
+    # Sanity check for completeness, but since "after" is also used
+    # in the Gerrit query this shouldn't ever actually be reached
+    if after and date < after:
+        return None
+
+    return date
+
+
+def query_gerrit(query):
+    """A requests wrapper to querying the Gerrit REST API"""
+    # TODO(fungi): this could be switched to call pygerrit instead
+
+    # The base URL to Gerrit, for accessing both GitWeb and the REST
+    # API
+    GERRIT_API_URL = 'https://review.openstack.org/'
+
+    # Retry http/https queries
+    retry = requests.Session()
+    retry.mount("http://", requests.adapters.HTTPAdapter(max_retries=3))
+    retry.mount("https://", requests.adapters.HTTPAdapter(max_retries=3))
+    raw = retry.get(GERRIT_API_URL + query,
+                    headers={'Accept': 'application/json'})
+
+    # Trap decoding failures and provide more detailed errors
+    try:
+        decoded = json.loads(raw.text[4:])
+    except:
+        print('\nrequest returned %s error to query:\n\n    %s\n'
+              '\nwith detail:\n\n    %s\n' % (raw, query, raw.text),
+              file=sys.stderr)
+        raise
+    return decoded
+
+
+def main():
+    """The giant pile of spaghetti which does everything else"""
+
+    # Record the start time for use later
+    start = datetime.datetime.utcnow()
+
+    # TODO(fungi): this could be trivially extracted to a separate
+    # function
+    parser = argparse.ArgumentParser(
+        description="When run using OpenStack's Gerrit server, this builds "
+        "YAML representations of aggregate change owner details and change "
+        "counts for each governance project-team, as well as a combined set "
+        "for all teams. Before and after dates/times should be supplied in "
+        "formats Gerrit accepts: https://review.openstack.org/Documentation/"
+        "user-search.html#search-operators")
+    parser.add_argument("-a", "--after", help="Start date for matching merges")
+    parser.add_argument("-b", "--before", help="End date for matching merges")
+    parser.add_argument("-c", "--config", help="Path to script configuration")
+    parser.add_argument("-i", "--ignore", help="Account Id numbers to skip",
+                        action='append')
+    parser.add_argument("-n", "--no-extra-atcs", help='Omit "extra ATCs"',
+                        dest='no_extra_atcs', action='store_true')
+    parser.add_argument("-o", "--outdir", help="Create an output directory")
+    parser.add_argument("-r", "--ref", help="Specify a Governance refname")
+    parser.add_argument("-s", "--sieve", help="Add Gerrit query parameters")
+    options = parser.parse_args()
+
+    # If we're supplied a configuration file, use it
+    if options.config:
+        config = yaml.safe_load(open(options.config))
+    # Otherwise, use nothing
+    else:
+        config = {}
+
+    # Start of the match timeframe for change merges
+    if options.after:
+        after = options.after
+    elif 'after' in config:
+        after = config['after']
+    else:
+        after = None
+
+    # End of the match timeframe for change merges
+    if options.before:
+        before = options.before
+    elif 'before' in config:
+        before = config['before']
+    else:
+        before = None
+
+    # Owner Ids for whom to ignore changes
+    if options.ignore:
+        ignore = [int(i) for i in options.ignore]
+    elif 'ignore' in config:
+        ignore = config['ignore']
+    else:
+        ignore = []
+
+    # Whether to omit "extra ATCs"
+    if options.no_extra_atcs:
+        no_extra_atcs = options.no_extra_atcs
+    elif 'no-extra-atcs' in config:
+        no_extra_atcs = config['no-extra-atcs']
+    else:
+        no_extra_atcs = False
+
+    # Output file directory
+    if options.outdir:
+        outdir = options.outdir
+    elif 'outdir' in config:
+        outdir = config['outdir']
+    else:
+        outdir = '.'
+    if not os.path.isdir(outdir):
+        os.makedirs(outdir)
+
+    # Governance Git repository ref object for reference lists
+    if options.ref:
+        ref = options.ref
+    elif 'ref' in config:
+        ref = config['ref']
+    else:
+        ref = 'refs/heads/master'
+
+    # Gerrit change query additions
+    if options.sieve:
+        sieve = options.sieve
+    elif 'sieve' in config:
+        sieve = config['sieve']
+    else:
+        sieve = None
+
+    # Path to the governance projects list, needs a Git refname as a
+    # parameter
+    PROJECTS_LIST = ('https://review.openstack.org/'
+                     'gitweb?p=openstack/governance.git;a=blob_plain;'
+                     'f=reference/projects.yaml;hb=%s')
+
+    # The query identifying relevant changes
+    match = 'status:merged'
+    if after:
+        match = '%s+after:"%s"' % (match, after)
+    if sieve:
+        match = '%s+%s' % (match, sieve)
+
+    # The set of projects from the governance repo
+    # TODO(fungi): make this a configurable option so that you can
+    # for example supply a custom project list for running elections
+    # in unofficial teams
+    gov_projects = requests.get(PROJECTS_LIST % ref)
+    gov_projects.encoding = 'utf-8'  # Workaround for Gitweb encoding
+    gov_projects = yaml.safe_load(gov_projects.text)
+
+    # A mapping of short (no prefix) to full repo names existing in
+    # Gerrit, used to handle repos which have a different namespace
+    # in governance during transitions and also to filter out repos
+    # listed in governance which don't actually exist
+    ger_repos = dict(
+        [(x.split('/')[-1], x) for x in query_gerrit('projects/')])
+
+    # This will be populated with change owners mapped to the
+    # project-teams maintaining their respective Git repositories
+    projects = {}
+
+    # This will be populated with all change owners and their
+    # account details
+    owners = {}
+
+    # This will be populated with discovered duplicate owners
+    duplicates = {}
+
+    # This will be populated with all individual E-mail addresses of
+    # change owners, to facilitate finding and merging duplicate
+    # accounts
+    all_emails = {}
+
+    # Iterate over all governance project-teams only at filename
+    # generation time
+    for project in gov_projects:
+        # This will be populated with change owner Ids and counts
+        projects[project] = {}
+
+        # Governance project-teams have one or more deliverables
+        for deliverable in gov_projects[project]['deliverables']:
+            # Each deliverable can have multiple repos
+            repos = gov_projects[project]['deliverables'][deliverable]['repos']
+
+            # Operate on repo short-names (no namespace) to avoid
+            # potential namespace mismatches between governance
+            # and Gerrit
+            for repo in [r.split('/')[-1] for r in repos]:
+                # Only process repos which actually exist in Gerrit,
+                # otherwise spew a warning if skipping
+                if repo not in ger_repos:
+                    print('MISSING: %s' % repo, file=sys.stderr)
+                else:
+                    # Query for an arbitrary change set and get
+                    # detailed account information about the most
+                    # recent patchset, paginating at 100 changes
+                    offset = 0
+                    changes = []
+                    while offset >= 0:
+                        changes += query_gerrit(
+                            'changes/?q=project:%s+%s&n=100&start=%s'
+                            '&o=CURRENT_COMMIT&o=CURRENT_REVISION'
+                            '&o=DETAILED_ACCOUNTS'
+                            % (ger_repos[repo], match, offset))
+                        if changes and changes[-1].get('_more_changes', False):
+                            offset += 100
+                        else:
+                            offset = -1
+
+                    # Iterate over each matched change in the repo
+                    for change in changes:
+                        # Get the merge date and skip if it's
+                        # outside any requested date range
+                        merged = date_merged(change, after, before)
+                        if not merged:
+                            continue
+
+                        # We index owners by their unique Gerrit
+                        # account Id numbers
+                        owner = change['owner']['_account_id']
+
+                        # If this owner is in the blacklist of Ids
+                        # to skip, then move on to the next change
+                        if owner in ignore:
+                            continue
+
+                        # Seen this owner already?
+                        new_owner = owner
+                        new = False
+                        if owner in duplicates:
+                            owner = duplicates[owner]
+                        elif owner not in owners:
+                            new = True
+
+                        # For new additions, initialize this as
+                        # their first and record specific account
+                        # details
+                        if new:
+                            # Get the set of all E-mail addresses
+                            # Gerrit knows for this owner's account
+                            emails = query_gerrit(
+                                'accounts/%s/emails'
+                                % change['owner']['_account_id'])
+
+                            # Find duplicate addresses and merge
+                            # accounts when that happens
+                            for email in emails:
+                                address = normalize_email(email['email'])
+                                if address in all_emails:
+                                    owner = all_emails[address]
+                                    duplicates[new_owner] = owner
+                                    print(
+                                        'MERGING DUPLICATE ACCOUNT: %s into %s'
+                                        % (new_owner, owner), file=sys.stderr)
+                                    break
+
+                        # For newly found non-duplicate owners,
+                        # initialize the global change count,
+                        # newest/oldest merged dates, and an empty
+                        # list where extra E-mail addresses can be
+                        # added; also track their full name and
+                        # Gerrit username
+                        if new and owner == new_owner:
+                            # TODO(fungi): this is a prime candidate
+                            # to become a struct, or maybe a class
+                            owners[owner] = {
+                                'count': 1,
+                                'extra': [],
+                                'name': change['owner'].get('name'),
+                                'newest': merged,
+                                'oldest': merged,
+                                'username': change['owner'].get('username'),
+                            }
+
+                        # If we've seen this owner on another change
+                        # in any repo then just iterate their global
+                        # change counter and update newest/oldest
+                        # dates
+                        else:
+                            owners[owner]['count'] += 1
+                            if merged > owners[owner]['newest']:
+                                owners[owner]['newest'] = merged
+                            elif merged < owners[owner]['oldest']:
+                                owners[owner]['oldest'] = merged
+
+                        # We only want to add addresses if this is a
+                        # new owner or a new duplicate
+                        if new:
+                            # Iterate over each E-mail address
+                            for email in emails:
+                                # Normalize the address before
+                                # performing any matching since
+                                # Gerrit doesn't do a great job of
+                                # this on its own
+                                address = normalize_email(email['email'])
+
+                                # Track this in the full list of all
+                                # known E-mail addresses
+                                all_emails[address] = owner
+
+                                # Whether Gerrit considers this the
+                                # preferred E-mail address
+                                preferred = email.get('preferred', False)
+
+                                # Store the preferred E-mail address
+                                # under its own key since it has a
+                                # special status, but only if this
+                                # is not a duplicate account
+                                if preferred and owner == new_owner:
+                                    owners[owner]['preferred'] = address
+
+                                    # If this was already added to
+                                    # the extras list due to an
+                                    # additional pre-normalized
+                                    # copy, remove it there
+                                    if address in owners[owner]['extra']:
+                                        owners[owner]['extra'].remove(address)
+
+                                # Store a list of non-preferred
+                                # addresses, deduplicating them in
+                                # case they match post-normalization
+                                # and treating duplicate preferred
+                                # addresses as # non-preferred
+                                else:
+                                    if ((address not in owners[owner]['extra'])
+                                            and (address != owners[owner].get(
+                                                'preferred', ''))):
+                                        owners[owner]['extra'].append(address)
+
+                        # If we've seen this owner on another change
+                        # in a repo under this project-team then
+                        # just iterate their team change counter and
+                        # update newest/oldest dates
+                        if owner in projects[project]:
+                            projects[project][owner]['count'] += 1
+                            if merged > projects[project][owner]['newest']:
+                                projects[project][owner]['newest'] = merged
+                            elif merged < projects[project][owner]['oldest']:
+                                projects[project][owner]['oldest'] = merged
+
+                        # ...otherwise initialize this as their
+                        # first
+                        else:
+                            # TODO(fungi): another potential struct
+                            projects[project][owner] = {
+                                'count': 1,
+                                'newest': merged,
+                                'oldest': merged,
+                            }
+
+    # The negative counter will be used as a makeshift account Id
+    # for non-code contributors; those with owned changes use their
+    # Gerrit account Id instead
+    counter = 1
+
+    # Use the before time as the only contribution time for non-code
+    # contributors, falling back on the script start time if before
+    # was not specified
+    if before:
+        if len(before) == 10:
+            stamp = before + ' 00:00:00'
+        else:
+            stamp = before
+    else:
+        stamp = start.isoformat(sep=' ').split('.')[0]
+
+    # Iterate over all extra-atcs entries
+    if not no_extra_atcs:
+        for project in gov_projects:
+            for extra_atc in gov_projects[project].get('extra-atcs', []):
+                name = extra_atc['name']
+                email = extra_atc['email']
+                address = normalize_email(email)
+                if address in all_emails:
+                    owner = all_emails[address]
+                else:
+                    owner = -counter
+                    all_emails[address] = owner
+                    owners[owner] = {
+                        'count': -1,
+                        'extra': [],
+                        'name': name,
+                        'newest': stamp,
+                        'oldest': stamp,
+                        'preferred': address,
+                        'username': '_non_code_contributor',
+                    }
+                if owner not in projects[project]:
+                    projects[project][owner] = {
+                        'count': -1,
+                        'newest': stamp,
+                        'oldest': stamp,
+                    }
+                counter += 1
+
+    # This will hold an address list for TC electorate rolls
+    electorate = []
+
+    # A table of owners for summit invites
+    invites = []
+
+    # A fresh pass through the owners to build some other datasets
+    for owner in owners:
+        # Sort extra E-mail address lists for ease of comparison
+        owners[owner]['extra'].sort()
+
+        # Build the data used for an invite
+        invite = []
+        if 'username' not in owners[owner] or not owners[owner]['username']:
+            print(
+                'SKIPPING MALFORMED OWNER: no username found for account %s' %
+                owner, file=sys.stderr)
+            continue
+        invite.append(owners[owner]['username'])
+        if 'name' not in owners[owner] or not owners[owner]['name']:
+            print(
+                'SKIPPING MALFORMED OWNER: no fullname found for account %s' %
+                owner, file=sys.stderr)
+            continue
+        invite.append(owners[owner]['name'].encode('utf-8'))
+        if 'preferred' not in owners[owner]:
+            if 'extra' in owners[owner] and owners[owner]['extra']:
+                owners[owner]['preferred'] = owners[owner]['extra'][0]
+                owners[owner]['extra'] = owners[owner]['extra'][1:]
+                print(
+                    'MISSING PREFERRED EMAIL: used first extra address as '
+                    'account %s preferred' % owner, file=sys.stderr)
+            else:
+                print(
+                    'SKIPPING MALFORMED OWNER: no preferred or extra '
+                    'addresses found for account %s' % owner, file=sys.stderr)
+                continue
+        invite.append(owners[owner]['preferred'])
+        invite += owners[owner]['extra']
+        invites.append(invite)
+
+        # Append preferred addresses to the TC electorate
+        electorate.append(owners[owner]['preferred'] + '\n')
+
+    # Write out a YAML file covering all change owners
+    fd = open(os.path.join(outdir, '_all_owners.yaml'), 'w')
+    dumper(owners, stream=fd)
+    fd.close()
+
+    # Write out a YAML file covering tracked duplicate accounts
+    fd = open(os.path.join(outdir, '_duplicate_owners.yaml'), 'w')
+    dumper(duplicates, stream=fd)
+    fd.close()
+
+    # Write out a team-specific electoral roll for CIVS
+    fd = open(os.path.join(outdir, '_electorate.txt'), 'w')
+    fd.writelines(electorate)
+    fd.close()
+
+    # Write out a CSV file appropriate for the invite2summit tool
+    fd = open(os.path.join(outdir, '_invites.csv'), 'w')
+    csv.writer(fd).writerows(invites)
+    fd.close()
+
+    # Make another pass through the projects so they can be dumped
+    # to our output files
+    for project in projects:
+
+        # This will hold team-specific info for writing
+        output = {}
+
+        # This will hold an address list for PTL electoral rolls
+        electorate = []
+
+        # Use a normalized project name for output file names
+        normalized_project = normalize_project(project)
+
+        # Iterate over each change owner for the current team
+        for owner in projects[project]:
+            # Copy the global owner details into our output since
+            # we're going to modify some
+            output[owner] = dict(owners[owner])
+
+            # Replace the owner change count and newest/oldest
+            # merged dates with the team-specific value rather than
+            # using the count from the global set
+            for field in ('count', 'newest', 'oldest'):
+                output[owner][field] = projects[project][owner][field]
+
+            # Append preferred addresses to the PTL electoral rolls
+            electorate.append(owners[owner]['preferred'] + '\n')
+
+        # Write out a team-specific YAML file
+        fd = open(os.path.join(outdir, '%s.yaml' % normalized_project), 'w')
+        dumper(output, stream=fd)
+        fd.close()
+
+        # Write out a team-specific electoral roll for CIVS
+        fd = open(os.path.join(outdir, '%s.txt' % normalized_project), 'w')
+        fd.writelines(electorate)
+        fd.close()
+
+main()