#!/usr/bin/env python # Copyright (c) 2016 OpenStack Foundation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS # IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either # express or implied. See the License for the specific language # governing permissions and limitations under the License. # Description: When run using OpenStack's Gerrit server, this builds # YAML representations of aggregate change owner details and change # counts for each governance project-team, as well as a combined set # for all teams. # Rationale: The OpenStack Technical Committee and Project Team Lead # elections need electorate rolls taken from "Active Technical # Contributors" to any repos under official project-teams over a # particular timeframe. Similarly, the OpenStack Foundation gives # summit registration discount codes to contributors meeting similar # criteria. The Gerrit REST API provides access to all the data # necessary to identify these individuals. # Use: The results end up in files named for each # official governance project-team (or "all") ending with a .yaml # extension. At the time of writing, it takes approximately 30 # minutes to run on a well-connected machine with 70-80ms round-trip # latency to review.openstack.org. # An example for generating the March 2016 technical election rolls: # # $ virtualenv venv # [...] # $ ./venv/bin/pip install pyyaml requests # [...] # $ ./venv/bin/python tools/owners.py -a 2015-03-04 \ # -b 2016-03-04 -i 11131 -i 22816 -o owners \ # -r march-2016-elections # MISSING: ansible-build-image # MERGING DUPLICATE ACCOUNT: 8074 into 2467 # [...blah, blah, blah...wait for completion...] 
# # TODO(fungi): Add a pass which will correctly generate the # stable_branch_maintenance.* files. In the meantime, to properly # generate the SBM PTL electorate, run a second time with a # different -o of sbm, adding the -n and -s options, and then copy # the full electorate over like: # # $ ./venv/bin/python tools/owners.py -a 2015-03-04 \ # -b 2016-03-04 -i 11131 -i 22816 -o sbm \ # -r march-2016-elections -n -s 'branch:^stable/.*' # [...wait for completion again...] # $ cp sbm/_electorate.txt owners/stable_branch_maintenance.txt # $ cp sbm/_all_owners.yaml owners/stable_branch_maintenance.yaml # # Once complete, make a compressed tarball of the owners directory # and send it attached to a PGP/MIME signed message to the appointed # election officials. The various *.txt files are lists of the # preferred addresses of all valid voters for the various PTL # elections (whose team names correspond to the file names), # suitable for passing directly to CIVS. The similarly named *.yaml # files are detailed structured data about the same sets of voters, # for use in validating the address lists. The _electorate.txt file # is the equivalent address list for the TC election voters, and its # corresponding structured data is in _all_owners.yaml. # You can also do interesting analysis on _all_owners.yaml, for # example: # # $ ./venv/bin/python # >>> import yaml # >>> # >>> o = yaml.load(open('owners/_all_owners.yaml')) # >>> for c in range(5): # ... print('Owners of at least %s changes: %s' % ( # ... c+1, # ... len({k: v for k, v in o.iteritems() if v['count'] > c}))) # ... 
# Owners of at least 1 changes: 3239
# Owners of at least 2 changes: 2352
# Owners of at least 3 changes: 1924
# Owners of at least 4 changes: 1682
# Owners of at least 5 changes: 1504

from __future__ import print_function

import argparse
import csv
import datetime
import json
import os
import sys

import requests
import yaml

try:
    from string import maketrans
except ImportError:  # Python3
    # str.translate() needs a table built by str.maketrans();
    # bytes.maketrans() rejects str arguments outright
    maketrans = str.maketrans


def dumper(data, stream):
    """Convenience wrapper to consistently set YAML formatting"""
    return yaml.safe_dump(data, allow_unicode=True,
                          default_flow_style=False, encoding='utf-8',
                          stream=stream)


def normalize_email(email):
    """Lower-case the domain part of E-mail addresses to better spot
    duplicate entries, since the domain part is case-insensitive
    courtesy of DNS while the local part is not necessarily

    Only the text after the last "@" is treated as the domain, so a
    local part which itself contains "@" does not raise. A string with
    no "@" at all is returned unchanged."""
    local, at, domain = email.rpartition('@')
    if not at:
        return email
    return '%s@%s' % (local, domain.lower())


def normalize_project(project):
    """Replace spaces and hyphens with underscores in project teams
    and then lower-case them, for more convenient filenames"""
    return project.translate(maketrans(' -', '__')).lower()


def date_merged(change, after=None, before=None):
    """Determine the date and time a specific change merged

    Returns the "submitted" timestamp of the change with subsecond
    precision stripped, or None if the change has no such timestamp
    or it falls outside the half-open range [after, before)."""
    date = change.get('submitted', None)
    if not date:
        # Something's terribly wrong with any changes matching this now
        print(
            'SKIPPING DATELESS MERGE: change %s for account %s'
            % (change['_number'], change['owner']['_account_id']),
            file=sys.stderr)
        return None
    # Strip superfluous subsecond values as Gerrit always just
    # reports .000000000 for them anyway
    date = date.split('.')[0]
    # Pass back an invalid result if it falls after the requested
    # cutoff
    if before and date >= before:
        return None
    # Sanity check for completeness, but since "after" is also used
    # in the Gerrit query this shouldn't ever actually be reached
    if after and date < after:
        return None
    return date


def requester(url, params=None, headers=None):
    """A requests wrapper to consistently retry HTTP(S) queries

    Returns the requests response object for a GET of url. The session
    is closed before returning; the response body has already been
    read by then so .text remains usable."""
    with requests.Session() as session:
        # Try up to 3 times, for both schemes since the cgit
        # interface is queried over plain HTTP
        for scheme in ('https://', 'http://'):
            session.mount(
                scheme, requests.adapters.HTTPAdapter(max_retries=3))
        return session.get(url=url, params=params, headers=headers)


def decode_json(raw):
    """Trap JSON decoding failures and provide more detailed errors

    raw is a response object exposing .text and .url; the decoded
    JSON document is returned. Decoding errors are reported on stderr
    and then re-raised."""
    # Gerrit's REST API prepends a JSON-breaker to avoid XSS
    # vulnerabilities
    if raw.text.startswith(")]}'"):
        trimmed = raw.text[4:]
    else:
        trimmed = raw.text
    # Try to decode and bail with much detail if it fails; JSON
    # decoding errors are all ValueError (or a subclass of it)
    try:
        decoded = json.loads(trimmed)
    except ValueError:
        print('\nrequest returned %s error to query:\n\n %s\n'
              '\nwith detail:\n\n %s\n' % (raw, raw.url, trimmed),
              file=sys.stderr)
        raise
    return decoded


def query_gerrit(method, params=None):
    """Query the Gerrit REST API"""
    # The base URL to Gerrit REST API
    GERRIT_API_URL = 'https://review.openstack.org/'
    raw = requester(GERRIT_API_URL + method, params=params,
                    headers={'Accept': 'application/json'})
    return decode_json(raw)


def get_from_cgit(project, obj, params=None):
    """Retrieve a file from the cgit interface and parse it as YAML"""
    url = 'http://git.openstack.org/cgit/' + project + '/plain/' + obj
    raw = requester(url, params=params,
                    headers={'Accept': 'application/json'})
    return yaml.safe_load(raw.text)


def lookup_member(email):
    """A requests wrapper to querying the OSF member directory API"""
    # The OpenStack foundation member directory lookup API endpoint
    MEMBER_LOOKUP_URL = 'https://openstackid-resources.openstack.org/'
    # URL pattern for querying foundation members by E-mail address
    # NOTE(review): the concatenation below yields a double slash in
    # the path; left as-is since that is the URL the service has been
    # queried with so far
    raw = requester(
        MEMBER_LOOKUP_URL + '/api/public/v1/members',
        params={'filter[]': [
            'group_slug==foundation-members',
            'email==' + email,
        ]},
        headers={'Accept': 'application/json'},
    )
    return decode_json(raw)


def usage(argv):
    """Parse command line argument"""
    parser = argparse.ArgumentParser(
        description="When run using OpenStack's Gerrit server, this builds "
        "YAML representations of aggregate change owner details and change "
        "counts for each governance project-team, as well as a combined set "
        "for all teams. Before and after dates/times should be supplied in "
        "formats Gerrit accepts: https://review.openstack.org/Documentation/"
        "user-search.html#search-operators")
    parser.add_argument("-a", "--after", help="Start date for matching merges")
    parser.add_argument("-b", "--before", help="End date for matching merges")
    parser.add_argument("-c", "--config", help="Path to script configuration")
    parser.add_argument("-i", "--ignore", help="Account Id numbers to skip",
                        action='append')
    parser.add_argument("-n", "--no-extra-atcs", help='Omit "extra ATCs"',
                        dest='no_extra_atcs', action='store_true')
    parser.add_argument("-o", "--outdir", help="Create an output directory")
    parser.add_argument("-r", "--ref", help="Specify a Governance refname")
    parser.add_argument("-s", "--sieve", help="Add Gerrit query parameters")
    return parser.parse_args(argv[1:])


def _setting(cli_value, config, key, default=None):
    """Pick a command-line value if set, else the config entry, else
    the default"""
    if cli_value:
        return cli_value
    return config.get(key, default)


def _fetch_changes(repo, match):
    """Return all changes in a Gerrit repo matching the query, with
    detailed account information about the most recent patchset,
    paginating at 100 changes"""
    changes = []
    offset = 0
    while offset >= 0:
        changes += query_gerrit('changes/', params={
            'q': 'project:%s %s' % (repo, match),
            'n': '100',
            'start': offset,
            'o': [
                'CURRENT_COMMIT',
                'CURRENT_REVISION',
                'DETAILED_ACCOUNTS',
            ],
        })
        # Gerrit flags the last entry of a page when more remain
        if changes and changes[-1].get('_more_changes', False):
            offset += 100
        else:
            offset = -1
    return changes


def _open_csv(path):
    """Open a file for csv.writer in the mode each Python major
    version expects"""
    if sys.version_info[0] < 3:
        return open(path, 'wb')
    return open(path, 'w', newline='', encoding='utf-8')


def main(argv=sys.argv):
    """The giant pile of spaghetti which does everything else

    Resolves settings from the command line and optional config file,
    collects merged-change owners per governance project-team from
    Gerrit, merges duplicate accounts by E-mail address, adds "extra
    ATCs", looks up foundation membership, and writes the YAML, text
    and CSV result files into the output directory."""

    # Record the start time for use later
    start = datetime.datetime.utcnow()

    options = usage(argv)

    # If we're supplied a configuration file, use it; an empty file
    # parses to None so fall back on an empty dict
    if options.config:
        with open(options.config) as config_file:
            config = yaml.safe_load(config_file) or {}
    # Otherwise, use nothing
    else:
        config = {}

    # Start and end of the match timeframe for change merges; these
    # are compared as strings later, while unquoted dates in a YAML
    # config are parsed into date objects, so coerce them
    after = _setting(options.after, config, 'after')
    if after is not None:
        after = str(after)
    before = _setting(options.before, config, 'before')
    if before is not None:
        before = str(before)

    # Owner Ids for whom to ignore changes
    if options.ignore:
        ignore = set(int(i) for i in options.ignore)
    else:
        ignore = set(int(i) for i in config.get('ignore') or [])

    # Whether to omit "extra ATCs"
    no_extra_atcs = _setting(
        options.no_extra_atcs, config, 'no-extra-atcs', False)

    # Output file directory
    outdir = _setting(options.outdir, config, 'outdir', '.')
    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    # Governance Git repository ref object for reference lists
    ref = _setting(options.ref, config, 'ref', 'refs/heads/master')

    # Gerrit change query additions
    sieve = _setting(options.sieve, config, 'sieve')

    # The query identifying relevant changes
    match = 'status:merged'
    if after:
        match = '%s after:"%s"' % (match, after)
    if sieve:
        match = '%s %s' % (match, sieve)

    # Retrieve the governance projects list, needs a Git refname as a
    # parameter
    # TODO(fungi): make this a configurable option so that you can
    # for example supply a custom project list for running elections
    # in unofficial teams
    gov_projects = get_from_cgit('openstack/governance',
                                 'reference/projects.yaml',
                                 {'h': ref})

    # The set of retired or removed "legacy" projects from governance
    # are merged into the main dict if their retired-on date falls
    # later than the after parameter for the qualifying time period
    # TODO(fungi): make this a configurable option
    old_projects = get_from_cgit('openstack/governance',
                                 'reference/legacy.yaml',
                                 {'h': ref})
    for project in old_projects:
        old_deliverables = old_projects[project]['deliverables']
        for deliverable in old_deliverables:
            # A deliverable's own retirement date takes precedence
            # over the one for its team
            if 'retired-on' in old_deliverables[deliverable]:
                retired = old_deliverables[deliverable]['retired-on']
            elif 'retired-on' in old_projects[project]:
                retired = old_projects[project]['retired-on']
            else:
                retired = None
            if retired:
                retired = retired.isoformat()
                if after and after > retired:
                    continue
            if project not in gov_projects:
                gov_projects[project] = {'deliverables': {}}
            if deliverable in gov_projects[project]['deliverables']:
                print('Skipping duplicate/partially retired deliverable: %s'
                      % deliverable, file=sys.stderr)
                continue
            gov_projects[project]['deliverables'][deliverable] = (
                old_deliverables[deliverable])

    # A mapping of short (no prefix) to full repo names existing in
    # Gerrit, used to handle repos which have a different namespace
    # in governance during transitions and also to filter out repos
    # listed in governance which don't actually exist
    ger_repos = {x.split('/')[-1]: x for x in query_gerrit('projects/')}

    # This will be populated with change owners mapped to the
    # project-teams maintaining their respective Git repositories
    projects = {}

    # This will be populated with all change owners and their
    # account details
    owners = {}

    # This will be populated with discovered duplicate owners
    duplicates = {}

    # This will be populated with all individual E-mail addresses of
    # change owners, to facilitate finding and merging duplicate
    # accounts
    all_emails = {}

    # Iterate over all governance project-teams
    for project in gov_projects:
        # This will be populated with change owner Ids and counts
        projects[project] = {}

        # Governance project-teams have one or more deliverables
        for deliverable in gov_projects[project]['deliverables']:
            # Each deliverable can have multiple repos
            repos = gov_projects[project]['deliverables'][deliverable][
                'repos']

            # Operate on repo short-names (no namespace) to avoid
            # potential namespace mismatches between governance
            # and Gerrit
            for repo in [r.split('/')[-1] for r in repos]:
                # Only process repos which actually exist in Gerrit,
                # otherwise spew a warning if skipping
                if repo not in ger_repos:
                    print('MISSING: %s' % repo, file=sys.stderr)
                    continue

                # Iterate over each matched change in the repo
                for change in _fetch_changes(ger_repos[repo], match):
                    # Get the merge date and skip if it's
                    # outside any requested date range
                    merged = date_merged(change, after, before)
                    if not merged:
                        continue

                    # We index owners by their unique Gerrit
                    # account Id numbers
                    owner = change['owner']['_account_id']

                    # If this owner is in the blacklist of Ids
                    # to skip, then move on to the next change
                    if owner in ignore:
                        continue

                    # Seen this owner already?
                    new_owner = owner
                    new = False
                    if owner in duplicates:
                        owner = duplicates[owner]
                    elif owner not in owners:
                        new = True

                    # For new additions, initialize this as
                    # their first and record specific account
                    # details
                    if new:
                        # Get the set of all E-mail addresses
                        # Gerrit knows for this owner's account
                        emails = query_gerrit(
                            'accounts/%s/emails'
                            % change['owner']['_account_id'])

                        # Find duplicate addresses and merge
                        # accounts when that happens
                        for email in emails:
                            address = normalize_email(email['email'])
                            if address in all_emails:
                                owner = all_emails[address]
                                duplicates[new_owner] = owner
                                print(
                                    'MERGING DUPLICATE ACCOUNT: %s into %s'
                                    % (new_owner, owner), file=sys.stderr)
                                break

                    # For newly found non-duplicate owners,
                    # initialize the global change count,
                    # newest/oldest merged dates, and an empty
                    # list where extra E-mail addresses can be
                    # added; also track their full name and
                    # Gerrit username
                    if new and owner == new_owner:
                        # TODO(fungi): this is a prime candidate
                        # to become a struct, or maybe a class
                        owners[owner] = {
                            'count': 1,
                            'extra': [],
                            'name': change['owner'].get('name'),
                            'newest': merged,
                            'oldest': merged,
                            'username': change['owner'].get('username'),
                        }

                    # If we've seen this owner on another change
                    # in any repo then just iterate their global
                    # change counter and update newest/oldest
                    # dates
                    else:
                        owners[owner]['count'] += 1
                        if merged > owners[owner]['newest']:
                            owners[owner]['newest'] = merged
                        elif merged < owners[owner]['oldest']:
                            owners[owner]['oldest'] = merged

                    # We only want to add addresses if this is a
                    # new owner or a new duplicate
                    if new:
                        # Iterate over each E-mail address
                        for email in emails:
                            # Normalize the address before
                            # performing any matching since
                            # Gerrit doesn't do a great job of
                            # this on its own
                            address = normalize_email(email['email'])

                            # Track this in the full list of all
                            # known E-mail addresses
                            all_emails[address] = owner

                            # Whether Gerrit considers this the
                            # preferred E-mail address
                            preferred = email.get('preferred', False)

                            # Store the preferred E-mail address
                            # under its own key since it has a
                            # special status, but only if this
                            # is not a duplicate account
                            if preferred and owner == new_owner:
                                owners[owner]['preferred'] = address

                                # If this was already added to
                                # the extras list due to an
                                # additional pre-normalized
                                # copy, remove it there
                                if address in owners[owner]['extra']:
                                    owners[owner]['extra'].remove(address)

                            # Store a list of non-preferred
                            # addresses, deduplicating them in
                            # case they match post-normalization
                            # and treating duplicate preferred
                            # addresses as non-preferred
                            else:
                                if ((address not in owners[owner]['extra'])
                                        and (address != owners[owner].get(
                                            'preferred', ''))):
                                    owners[owner]['extra'].append(address)

                    # If we've seen this owner on another change
                    # in a repo under this project-team then
                    # just iterate their team change counter and
                    # update newest/oldest dates
                    if owner in projects[project]:
                        team_owner = projects[project][owner]
                        team_owner['count'] += 1
                        if merged > team_owner['newest']:
                            team_owner['newest'] = merged
                        elif merged < team_owner['oldest']:
                            team_owner['oldest'] = merged

                    # ...otherwise initialize this as their
                    # first
                    else:
                        # TODO(fungi): another potential struct
                        projects[project][owner] = {
                            'count': 1,
                            'newest': merged,
                            'oldest': merged,
                        }

    # The negative counter will be used as a makeshift account Id
    # for non-code contributors; those with owned changes use their
    # Gerrit account Id instead
    counter = 1

    # Use the before time as the only contribution time for non-code
    # contributors, falling back on the script start time if before
    # was not specified
    if before:
        # A bare YYYY-MM-DD date gets a midnight time appended
        if len(before) == 10:
            stamp = before + ' 00:00:00'
        else:
            stamp = before
    else:
        stamp = start.isoformat(sep=' ').split('.')[0]

    # Iterate over all extra-atcs entries
    if not no_extra_atcs:
        for project in gov_projects:
            for extra_atc in gov_projects[project].get('extra-atcs', []):
                name = extra_atc['name']
                email = extra_atc['email']
                address = normalize_email(email)
                if address in all_emails:
                    owner = all_emails[address]
                else:
                    owner = -counter
                    all_emails[address] = owner
                    owners[owner] = {
                        'count': -1,
                        'extra': [],
                        'name': name,
                        'newest': stamp,
                        'oldest': stamp,
                        'preferred': address,
                        'username': '_non_code_contributor',
                    }
                if owner not in projects[project]:
                    projects[project][owner] = {
                        'count': -1,
                        'newest': stamp,
                        'oldest': stamp,
                    }
                counter += 1

    # This will hold an address list for TC electorate rolls
    electorate = []

    # A table of owners for summit invites
    invites = []

    # A fresh pass through the owners to build some other datasets
    for owner in owners:
        # Sort extra E-mail address lists for ease of comparison
        owners[owner]['extra'].sort()

        # Build the data used for an invite
        if 'name' not in owners[owner] or not owners[owner]['name']:
            print(
                'SKIPPING MALFORMED OWNER: no fullname found for account %s'
                % owner, file=sys.stderr)
            continue
        if 'preferred' not in owners[owner]:
            if 'extra' in owners[owner] and owners[owner]['extra']:
                owners[owner]['preferred'] = owners[owner]['extra'][0]
                owners[owner]['extra'] = owners[owner]['extra'][1:]
                print(
                    'MISSING PREFERRED EMAIL: used first extra address as '
                    'account %s preferred' % owner, file=sys.stderr)
            else:
                print(
                    'SKIPPING MALFORMED OWNER: no preferred or extra '
                    'addresses found for account %s' % owner,
                    file=sys.stderr)
                continue

        # Look the owner up by each address, preferred one first, and
        # stop at the first membership found so that later addresses
        # neither cost a request nor override that match
        for email in [owners[owner]['preferred']] + owners[owner]['extra']:
            member = lookup_member(email)
            if member['data']:
                owners[owner]['member'] = member['data'][0]['id']
                break

        # Python 2's csv module wants encoded byte strings, while on
        # Python 3 bytes would be written out as their b'...' repr
        fullname = owners[owner]['name']
        if sys.version_info[0] < 3:
            fullname = fullname.encode('utf-8')

        invite = [owners[owner].get('member', '0')]
        invite.append(fullname)
        invite.append(owners[owner]['preferred'])
        invite += owners[owner]['extra']
        invites.append(invite)

        # Append preferred addresses to the TC electorate for members only
        if 'member' in owners[owner]:
            electorate.append(owners[owner]['preferred'] + '\n')

    # Write out a YAML file covering all change owners
    with open(os.path.join(outdir, '_all_owners.yaml'), 'w') as fd:
        dumper(owners, stream=fd)

    # Write out a YAML file covering tracked duplicate accounts
    with open(os.path.join(outdir, '_duplicate_owners.yaml'), 'w') as fd:
        dumper(duplicates, stream=fd)

    # Write out the TC electoral roll for CIVS
    with open(os.path.join(outdir, '_electorate.txt'), 'w') as fd:
        fd.writelines(electorate)

    # Write out a CSV file appropriate for the invite2summit tool
    with _open_csv(os.path.join(outdir, '_invites.csv')) as fd:
        csv.writer(fd).writerows(invites)

    # Make another pass through the projects so they can be dumped
    # to our output files
    for project in projects:

        # This will hold team-specific info for writing
        output = {}

        # This will hold an address list for PTL electoral rolls
        electorate = []

        # Use a normalized project name for output file names
        normalized_project = normalize_project(project)

        # Iterate over each change owner for the current team
        for owner in projects[project]:
            # Copy the global owner details into our output since
            # we're going to modify some
            output[owner] = dict(owners[owner])

            # Replace the owner change count and newest/oldest
            # merged dates with the team-specific value rather than
            # using the count from the global set
            for field in ('count', 'newest', 'oldest'):
                output[owner][field] = projects[project][owner][field]

            # Append preferred member addresses to the PTL electoral rolls
            if 'member' in owners[owner]:
                electorate.append(owners[owner]['preferred'] + '\n')

        # Write out a team-specific YAML file
        with open(os.path.join(
                outdir, '%s.yaml' % normalized_project), 'w') as fd:
            dumper(output, stream=fd)

        # Write out a team-specific electoral roll for CIVS
        with open(os.path.join(
                outdir, '%s.txt' % normalized_project), 'w') as fd:
            fd.writelines(electorate)


if __name__ == "__main__":
    main()