system-config/tools/owners.py
Jeremy Stanley 9341118edd Use foundation member IDs in invitations list
Since the foundation sometimes needs to deduplicate subsequent
invitations for the same foundation member against invitations sent
by another system, use the member ID as the first field of the
invite instead of using the Gerrit username now that we have that
information available. Use a default member ID of 0 to indicate
non-member invitees who may require additional manual deduplication
or vetting. Also log the member ID in the output of the invite mass
mailer script.

Change-Id: I86d879a5f06144a0889eb852f2cf3d555a12a7aa
2017-08-03 20:20:30 +00:00

725 lines
28 KiB
Python

#!/usr/bin/env python
# Copyright (c) 2016 OpenStack Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS
# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language
# governing permissions and limitations under the License.
# Description: When run using OpenStack's Gerrit server, this builds
# YAML representations of aggregate change owner details and change
# counts for each governance project-team, as well as a combined set
# for all teams.
# Rationale: The OpenStack Technical Committee and Project Team Lead
# elections need electorate rolls taken from "Active Technical
# Contributors" to any repos under official project-teams over a
# particular timeframe. Similarly, the OpenStack Foundation gives
# summit registration discount codes to contributors meeting similar
# criteria. The Gerrit REST API provides access to all the data
# necessary to identify these individuals.
# Use: The results end up in files named for each
# official governance project-team (or "all") ending with a .yaml
# extension. At the time of writing, it takes approximately 30
# minutes to run on a well-connected machine with 70-80ms round-trip
# latency to review.openstack.org.
# An example for generating the March 2016 technical election rolls:
#
# $ virtualenv venv
# [...]
# $ ./venv/bin/pip install pyyaml requests
# [...]
# $ ./venv/bin/python tools/owners.py -a 2015-03-04 \
# -b 2016-03-04 -i 11131 -i 22816 -o owners \
# -r march-2016-elections
# MISSING: ansible-build-image
# MERGING DUPLICATE ACCOUNT: 8074 into 2467
# [...blah, blah, blah...wait for completion...]
#
# TODO(fungi): Add a pass which will correctly generate the
# stable_branch_maintenance.* files. In the meantime, to properly
# generate the SBM PTL electorate, run a second time with a
# different -o of sbm, adding the -n and -s options, and then copy
# the full electorate over like:
#
# $ ./venv/bin/python tools/owners.py -a 2015-03-04 \
# -b 2016-03-04 -i 11131 -i 22816 -o sbm \
# -r march-2016-elections -n -s 'branch:^stable/.*'
# [...wait for completion again...]
# $ cp sbm/_electorate.txt owners/stable_branch_maintenance.txt
# $ cp sbm/_all_owners.yaml owners/stable_branch_maintenance.yaml
#
# Once complete, make a compressed tarball of the owners directory
# and send it attached to a PGP/MIME signed message to the appointed
# election officials. The various *.txt files are lists of the
# preferred addresses of all valid voters for the various PTL
# elections (whose team names correspond to the file names),
# suitable for passing directly to CIVS. The similarly named *.yaml
# files are detailed structured data about the same sets of voters,
# for use in validating the address lists. The _electorate.txt file
# is the equivalent address list for the TC election voters, and its
# corresponding structured data is in _all_owners.yaml.
# You can also do interesting analysis on _all_owners.yaml, for
# example:
#
# $ ./venv/bin/python
# >>> import yaml
# >>>
# >>> o = yaml.load(open('owners/_all_owners.yaml'))
# >>> for c in range(5):
# ... print('Owners of at least %s changes: %s' % (
# ... c+1,
# ... len({k: v for k, v in o.iteritems() if v['count'] > c})))
# ...
# Owners of at least 1 changes: 3239
# Owners of at least 2 changes: 2352
# Owners of at least 3 changes: 1924
# Owners of at least 4 changes: 1682
# Owners of at least 5 changes: 1504
from __future__ import print_function
import argparse
import csv
import datetime
import json
import os
import sys
import requests
import yaml
try:
from string import maketrans
except ImportError: # Python3
maketrans = bytes.maketrans
def dumper(data, stream):
"""Convenience wrapper to consistently set YAML formatting"""
return yaml.safe_dump(data, allow_unicode=True, default_flow_style=False,
encoding='utf-8', stream=stream)
def normalize_email(email):
"""Lower-case the domain part of E-mail addresses to better spot
duplicate entries, since the domain part is case-insensitive
courtesy of DNS while the local part is not necessarily"""
local, domain = email.split('@')
domain = domain.lower()
return '%s@%s' % (local, domain)
def normalize_project(project):
"""Replace spaces and hyphens with underscores in project teams
and then lower-case them, for more convenient filenames"""
return project.translate(maketrans(' -', '__')).lower()
def date_merged(change, after=None, before=None):
"""Determine the date and time a specific change merged"""
date = change.get('submitted', None)
if not date:
# Something's terribly wrong with any changes matching this now
print(
'SKIPPING DATELESS MERGE: change %s for account %s'
% (change['_number'], change['owner']['_account_id']),
file=sys.stderr)
return None
# Strip superfluous subsecond values as Gerrit always just
# reports .000000000 for them anyway
date = date.split('.')[0]
# Pass back an invalid result if it falls after the requested
# cutoff
if before and date >= before:
return None
# Sanity check for completeness, but since "after" is also used
# in the Gerrit query this shouldn't ever actually be reached
if after and date < after:
return None
return date
def requester(url, params={}, headers={}):
"""A requests wrapper to consistently retry HTTPS queries"""
# Try up to 3 times
retry = requests.Session()
retry.mount("https://", requests.adapters.HTTPAdapter(max_retries=3))
return retry.get(url=url, params=params, headers=headers)
def decode_json(raw):
"""Trap JSON decoding failures and provide more detailed errors"""
# Gerrit's REST API prepends a JSON-breaker to avoid XSS vulnerabilities
if raw.text.startswith(")]}'"):
trimmed = raw.text[4:]
else:
trimmed = raw.text
# Try to decode and bail with much detail if it fails
try:
decoded = json.loads(trimmed)
except:
print('\nrequest returned %s error to query:\n\n %s\n'
'\nwith detail:\n\n %s\n' % (raw, raw.url, trimmed),
file=sys.stderr)
raise
return decoded
def query_gerrit(method, params={}):
"""Query the Gerrit REST API"""
# The base URL to Gerrit, for accessing both GitWeb and the REST
# API
GERRIT_API_URL = 'https://review.openstack.org/'
raw = requester(GERRIT_API_URL + method, params=params,
headers={'Accept': 'application/json'})
# Gitweb queries don't return JSON, so just pass them along as-is
if method is 'gitweb':
raw.encoding = 'utf-8' # Workaround for Gitweb encoding
return yaml.safe_load(raw.text)
else:
return decode_json(raw)
def lookup_member(email):
"""A requests wrapper to querying the OSF member directory API"""
# The OpenStack foundation member directory lookup API endpoint
MEMBER_LOOKUP_URL = 'https://openstackid-resources.openstack.org/'
# URL pattern for querying foundation members by E-mail address
raw = requester(
MEMBER_LOOKUP_URL + '/api/public/v1/members',
params={'filter[]': [
'group_slug==foundation-members',
'email==' + email,
]},
headers={'Accept': 'application/json'},
)
return decode_json(raw)
def usage(argv):
"""Parse command line argument"""
parser = argparse.ArgumentParser(
description="When run using OpenStack's Gerrit server, this builds "
"YAML representations of aggregate change owner details and change "
"counts for each governance project-team, as well as a combined set "
"for all teams. Before and after dates/times should be supplied in "
"formats Gerrit accepts: https://review.openstack.org/Documentation/"
"user-search.html#search-operators")
parser.add_argument("-a", "--after", help="Start date for matching merges")
parser.add_argument("-b", "--before", help="End date for matching merges")
parser.add_argument("-c", "--config", help="Path to script configuration")
parser.add_argument("-i", "--ignore", help="Account Id numbers to skip",
action='append')
parser.add_argument("-n", "--no-extra-atcs", help='Omit "extra ATCs"',
dest='no_extra_atcs', action='store_true')
parser.add_argument("-o", "--outdir", help="Create an output directory")
parser.add_argument("-r", "--ref", help="Specify a Governance refname")
parser.add_argument("-s", "--sieve", help="Add Gerrit query parameters")
return parser.parse_args(argv[1:])
def main(argv=sys.argv):
"""The giant pile of spaghetti which does everything else"""
# Record the start time for use later
start = datetime.datetime.utcnow()
options = usage(argv)
# If we're supplied a configuration file, use it
if options.config:
config = yaml.safe_load(open(options.config))
# Otherwise, use nothing
else:
config = {}
# Start of the match timeframe for change merges
if options.after:
after = options.after
elif 'after' in config:
after = config['after']
else:
after = None
# End of the match timeframe for change merges
if options.before:
before = options.before
elif 'before' in config:
before = config['before']
else:
before = None
# Owner Ids for whom to ignore changes
if options.ignore:
ignore = [int(i) for i in options.ignore]
elif 'ignore' in config:
ignore = config['ignore']
else:
ignore = []
# Whether to omit "extra ATCs"
if options.no_extra_atcs:
no_extra_atcs = options.no_extra_atcs
elif 'no-extra-atcs' in config:
no_extra_atcs = config['no-extra-atcs']
else:
no_extra_atcs = False
# Output file directory
if options.outdir:
outdir = options.outdir
elif 'outdir' in config:
outdir = config['outdir']
else:
outdir = '.'
if not os.path.isdir(outdir):
os.makedirs(outdir)
# Governance Git repository ref object for reference lists
if options.ref:
ref = options.ref
elif 'ref' in config:
ref = config['ref']
else:
ref = 'refs/heads/master'
# Gerrit change query additions
if options.sieve:
sieve = options.sieve
elif 'sieve' in config:
sieve = config['sieve']
else:
sieve = None
# The query identifying relevant changes
match = 'status:merged'
if after:
match = '%s after:"%s"' % (match, after)
if sieve:
match = '%s %s' % (match, sieve)
# Retrieve the governance projects list, needs a Git refname as a
# parameter
# TODO(fungi): make this a configurable option so that you can
# for example supply a custom project list for running elections
# in unofficial teams
gov_projects = query_gerrit(
'gitweb', {
'p': 'openstack/governance.git',
'a': 'blob_plain',
'f': 'reference/projects.yaml',
'hb': ref,
})
# The set of retired or removed "legacy" projects from governance
# are merged into the main dict if their retired-on date falls
# later than the after parameter for the qualifying time period
# TODO(fungi): make this a configurable option
old_projects = query_gerrit(
'gitweb', {
'p': 'openstack/governance.git',
'a': 'blob_plain',
'f': 'reference/legacy.yaml',
'hb': ref,
})
for project in old_projects:
for deliverable in old_projects[project]['deliverables']:
if 'retired-on' in old_projects[project]['deliverables'][deliverable]:
retired = old_projects[project]['deliverables'][deliverable]['retired-on']
elif 'retired-on' in old_projects[project]:
retired = old_projects[project]['retired-on']
else:
retired = None
if retired:
retired = retired.isoformat()
if after and after > retired:
continue
if project not in gov_projects:
gov_projects[project] = {'deliverables': {}}
if deliverable in gov_projects[project]['deliverables']:
print('Skipping duplicate/partially retired deliverable: %s' % deliverable, file=sys.stderr)
continue
gov_projects[project]['deliverables'][deliverable] = old_projects[project]['deliverables'][deliverable]
# A mapping of short (no prefix) to full repo names existing in
# Gerrit, used to handle repos which have a different namespace
# in governance during transitions and also to filter out repos
# listed in governance which don't actually exist
ger_repos = dict(
[(x.split('/')[-1], x) for x in query_gerrit('projects/')])
# This will be populated with change owners mapped to the
# project-teams maintaining their respective Git repositories
projects = {}
# This will be populated with all change owners and their
# account details
owners = {}
# This will be populated with discovered duplicate owners
duplicates = {}
# This will be populated with all individual E-mail addresses of
# change owners, to facilitate finding and merging duplicate
# accounts
all_emails = {}
# Iterate over all governance project-teams only at filename
# generation time
for project in gov_projects:
# This will be populated with change owner Ids and counts
projects[project] = {}
# Governance project-teams have one or more deliverables
for deliverable in gov_projects[project]['deliverables']:
# Each deliverable can have multiple repos
repos = gov_projects[project]['deliverables'][deliverable]['repos']
# Operate on repo short-names (no namespace) to avoid
# potential namespace mismatches between governance
# and Gerrit
for repo in [r.split('/')[-1] for r in repos]:
# Only process repos which actually exist in Gerrit,
# otherwise spew a warning if skipping
if repo not in ger_repos:
print('MISSING: %s' % repo, file=sys.stderr)
else:
# Query for an arbitrary change set and get
# detailed account information about the most
# recent patchset, paginating at 100 changes
offset = 0
changes = []
while offset >= 0:
changes += query_gerrit('changes/', params={
'q': 'project:%s %s' % (ger_repos[repo], match),
'n': '100',
'start': offset,
'o': [
'CURRENT_COMMIT',
'CURRENT_REVISION',
'DETAILED_ACCOUNTS',
],
})
if changes and changes[-1].get('_more_changes', False):
offset += 100
else:
offset = -1
# Iterate over each matched change in the repo
for change in changes:
# Get the merge date and skip if it's
# outside any requested date range
merged = date_merged(change, after, before)
if not merged:
continue
# We index owners by their unique Gerrit
# account Id numbers
owner = change['owner']['_account_id']
# If this owner is in the blacklist of Ids
# to skip, then move on to the next change
if owner in ignore:
continue
# Seen this owner already?
new_owner = owner
new = False
if owner in duplicates:
owner = duplicates[owner]
elif owner not in owners:
new = True
# For new additions, initialize this as
# their first and record specific account
# details
if new:
# Get the set of all E-mail addresses
# Gerrit knows for this owner's account
emails = query_gerrit(
'accounts/%s/emails'
% change['owner']['_account_id'])
# Find duplicate addresses and merge
# accounts when that happens
for email in emails:
address = normalize_email(email['email'])
if address in all_emails:
owner = all_emails[address]
duplicates[new_owner] = owner
print(
'MERGING DUPLICATE ACCOUNT: %s into %s'
% (new_owner, owner), file=sys.stderr)
break
# For newly found non-duplicate owners,
# initialize the global change count,
# newest/oldest merged dates, and an empty
# list where extra E-mail addresses can be
# added; also track their full name and
# Gerrit username
if new and owner == new_owner:
# TODO(fungi): this is a prime candidate
# to become a struct, or maybe a class
owners[owner] = {
'count': 1,
'extra': [],
'name': change['owner'].get('name'),
'newest': merged,
'oldest': merged,
'username': change['owner'].get('username'),
}
# If we've seen this owner on another change
# in any repo then just iterate their global
# change counter and update newest/oldest
# dates
else:
owners[owner]['count'] += 1
if merged > owners[owner]['newest']:
owners[owner]['newest'] = merged
elif merged < owners[owner]['oldest']:
owners[owner]['oldest'] = merged
# We only want to add addresses if this is a
# new owner or a new duplicate
if new:
# Iterate over each E-mail address
for email in emails:
# Normalize the address before
# performing any matching since
# Gerrit doesn't do a great job of
# this on its own
address = normalize_email(email['email'])
# Track this in the full list of all
# known E-mail addresses
all_emails[address] = owner
# Whether Gerrit considers this the
# preferred E-mail address
preferred = email.get('preferred', False)
# Store the preferred E-mail address
# under its own key since it has a
# special status, but only if this
# is not a duplicate account
if preferred and owner == new_owner:
owners[owner]['preferred'] = address
# If this was already added to
# the extras list due to an
# additional pre-normalized
# copy, remove it there
if address in owners[owner]['extra']:
owners[owner]['extra'].remove(address)
# Store a list of non-preferred
# addresses, deduplicating them in
# case they match post-normalization
# and treating duplicate preferred
# addresses as # non-preferred
else:
if ((address not in owners[owner]['extra'])
and (address != owners[owner].get(
'preferred', ''))):
owners[owner]['extra'].append(address)
# If we've seen this owner on another change
# in a repo under this project-team then
# just iterate their team change counter and
# update newest/oldest dates
if owner in projects[project]:
projects[project][owner]['count'] += 1
if merged > projects[project][owner]['newest']:
projects[project][owner]['newest'] = merged
elif merged < projects[project][owner]['oldest']:
projects[project][owner]['oldest'] = merged
# ...otherwise initialize this as their
# first
else:
# TODO(fungi): another potential struct
projects[project][owner] = {
'count': 1,
'newest': merged,
'oldest': merged,
}
# The negative counter will be used as a makeshift account Id
# for non-code contributors; those with owned changes use their
# Gerrit account Id instead
counter = 1
# Use the before time as the only contribution time for non-code
# contributors, falling back on the script start time if before
# was not specified
if before:
if len(before) == 10:
stamp = before + ' 00:00:00'
else:
stamp = before
else:
stamp = start.isoformat(sep=' ').split('.')[0]
# Iterate over all extra-atcs entries
if not no_extra_atcs:
for project in gov_projects:
for extra_atc in gov_projects[project].get('extra-atcs', []):
name = extra_atc['name']
email = extra_atc['email']
address = normalize_email(email)
if address in all_emails:
owner = all_emails[address]
else:
owner = -counter
all_emails[address] = owner
owners[owner] = {
'count': -1,
'extra': [],
'name': name,
'newest': stamp,
'oldest': stamp,
'preferred': address,
'username': '_non_code_contributor',
}
if owner not in projects[project]:
projects[project][owner] = {
'count': -1,
'newest': stamp,
'oldest': stamp,
}
counter += 1
# This will hold an address list for TC electorate rolls
electorate = []
# A table of owners for summit invites
invites = []
# A fresh pass through the owners to build some other datasets
for owner in owners:
# Sort extra E-mail address lists for ease of comparison
owners[owner]['extra'].sort()
# Build the data used for an invite
if 'name' not in owners[owner] or not owners[owner]['name']:
print(
'SKIPPING MALFORMED OWNER: no fullname found for account %s' %
owner, file=sys.stderr)
continue
if 'preferred' not in owners[owner]:
if 'extra' in owners[owner] and owners[owner]['extra']:
owners[owner]['preferred'] = owners[owner]['extra'][0]
owners[owner]['extra'] = owners[owner]['extra'][1:]
print(
'MISSING PREFERRED EMAIL: used first extra address as '
'account %s preferred' % owner, file=sys.stderr)
else:
print(
'SKIPPING MALFORMED OWNER: no preferred or extra '
'addresses found for account %s' % owner, file=sys.stderr)
continue
for email in [owners[owner]['preferred']] + owners[owner]['extra']:
member = lookup_member(email)
if member['data']:
owners[owner]['member'] = member['data'][0]['id']
continue
invite = [owners[owner].get('member','0')]
invite.append(owners[owner]['name'].encode('utf-8'))
invite.append(owners[owner]['preferred'])
invite += owners[owner]['extra']
invites.append(invite)
# Append preferred addresses to the TC electorate for members only
if 'member' in owners[owner]:
electorate.append(owners[owner]['preferred'] + '\n')
# Write out a YAML file covering all change owners
fd = open(os.path.join(outdir, '_all_owners.yaml'), 'w')
dumper(owners, stream=fd)
fd.close()
# Write out a YAML file covering tracked duplicate accounts
fd = open(os.path.join(outdir, '_duplicate_owners.yaml'), 'w')
dumper(duplicates, stream=fd)
fd.close()
# Write out a team-specific electoral roll for CIVS
fd = open(os.path.join(outdir, '_electorate.txt'), 'w')
fd.writelines(electorate)
fd.close()
# Write out a CSV file appropriate for the invite2summit tool
fd = open(os.path.join(outdir, '_invites.csv'), 'w')
csv.writer(fd).writerows(invites)
fd.close()
# Make another pass through the projects so they can be dumped
# to our output files
for project in projects:
# This will hold team-specific info for writing
output = {}
# This will hold an address list for PTL electoral rolls
electorate = []
# Use a normalized project name for output file names
normalized_project = normalize_project(project)
# Iterate over each change owner for the current team
for owner in projects[project]:
# Copy the global owner details into our output since
# we're going to modify some
output[owner] = dict(owners[owner])
# Replace the owner change count and newest/oldest
# merged dates with the team-specific value rather than
# using the count from the global set
for field in ('count', 'newest', 'oldest'):
output[owner][field] = projects[project][owner][field]
# Append preferred addresses to the PTL electoral rolls
electorate.append(owners[owner]['preferred'] + '\n')
# Write out a team-specific YAML file
fd = open(os.path.join(outdir, '%s.yaml' % normalized_project), 'w')
dumper(output, stream=fd)
fd.close()
# Write out a team-specific electoral roll for CIVS
fd = open(os.path.join(outdir, '%s.txt' % normalized_project), 'w')
fd.writelines(electorate)
fd.close()
if __name__ == "__main__":
main()