Add tools being used to make sense of gerrit account inconsistencies

The first tool has been used to "retire" accounts that have preferred
email addresses without a matching external id. The second is being used
to make sense of whether or not we can do a bulk retirement of accounts
with email conflicts in their external ids. The third is a script that
can be used to remove external ids from accounts in bulk based on their
email addresses.

Change-Id: Idf22cfc9f2bac7d3921e006c40faef4585c2d977
This commit is contained in:
Clark Boylan 2021-02-26 15:08:45 -08:00
parent 3f1d67b99f
commit 112cbc6cfe
3 changed files with 426 additions and 0 deletions

View File

@ -0,0 +1,308 @@
# Script to query Gerrit users by email address to debug accounts with email
# address conflicts. The idea here is we'll identify which users are active
# and need proper manipulation to correct and which are inactive and can
# be retired.
#
# The input list of emails can be generated by a gerrit config consistency
# check against external ids.
#
# This script should also identify when accounts are inactive according to
# Gerrit and not just by our "have they pushed or reviewed code in the last
# year" metric. Accounts that are already inactive can be safely retired too.
# This script builds and operates on a datastructure that looks like this.
# john.doe@example.com:
#   1234:
#     active: True
#     recently_used: True
#     recent_change: '2021-01-23 17:31:25.000000000'
#     recent_review: None
#   5678:
#     active: False
#     recently_used: False
#     recent_change: None
#     recent_review: '2019-03-05 12:15:34.000000000'
#   active:
#     - 1234
#   inactive:
#     - 5678
#   recently_used:
#     - 1234
#   nonrecently_used:
#     - 5678
import datetime
import json
import getpass
import requests
# strptime pattern for Gerrit timestamps once they have been trimmed from
# nanosecond to microsecond precision (see recently_used()).
TIME_FORMAT = '%Y-%m-%d %H:%M:%S.%f'
# Gerrit reports timestamps in UTC, so "now" has to be taken in UTC as well or
# the one year window is skewed by the local UTC offset. The value is kept
# naive (no tzinfo) because the parsed Gerrit timestamps are naive too.
TODAY = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
# Accounts with activity inside this window count as recently used.
DELTAT = datetime.timedelta(days=365)
# Start of the "recent" window.
SINCET = TODAY - DELTAT
def query_gerrit(loc, query, auth=None):
    """GET ``loc`` from review.opendev.org and return the decoded JSON.

    ``query`` is passed as the request's query parameters. When ``auth`` is
    given the request goes to the ``a/`` variant of the endpoint and ``auth``
    is handed to requests.

    Proper data needs to be generated authenticated and as admin; running
    as a normal user is only useful for a first pass.

    Raises json.JSONDecodeError if the response body, minus its first five
    characters, is not valid JSON.
    """
    endpoint = 'a/' + loc if auth else loc
    response = requests.get('https://review.opendev.org/%s/' % endpoint,
                            params=query, auth=auth)
    # Strip off the gerrit json prefix (the first five characters) before
    # handing the rest to the JSON parser.
    return json.loads(response.text[5:])
def get_account_detail(account_id, auth=None):
    """Return the decoded ``accounts/<account_id>/detail`` response.

    Need to do this authenticated and as admin. Running without auth gives
    quicker debugging cycles, but proper data should be generated with auth.
    """
    endpoint = 'accounts/%s/detail' % account_id
    return query_gerrit(endpoint, {}, auth)
def get_account_sshkeys(account_id, auth=None):
    """Return the decoded ``accounts/<account_id>/sshkeys`` response.

    Need to do this authenticated and as admin. Running without auth gives
    quicker debugging cycles, but proper data should be generated with auth.

    Without ``auth`` an undecodable response is reported as an empty list;
    with ``auth`` the json.JSONDecodeError propagates to the caller.
    """
    endpoint = 'accounts/%s/sshkeys' % account_id
    try:
        return query_gerrit(endpoint, {}, auth)
    except json.JSONDecodeError:
        if not auth:
            # This handles the lack of auth error: the unauthenticated
            # response is not JSON, so treat it as "no keys visible".
            return []
        raise
def get_account_externalids(account_id, auth=None):
    """Return the decoded ``accounts/<account_id>/external.ids`` response.

    Need to do this authenticated and as admin. Running without auth gives
    quicker debugging cycles, but proper data should be generated with auth.

    Without ``auth`` an undecodable response is reported as an empty list;
    with ``auth`` the json.JSONDecodeError propagates to the caller.
    """
    endpoint = 'accounts/%s/external.ids' % account_id
    try:
        return query_gerrit(endpoint, {}, auth)
    except json.JSONDecodeError:
        if not auth:
            # This handles the lack of auth error: the unauthenticated
            # response is not JSON, so treat it as "no external ids visible".
            return []
        raise
def recently_used(timestamp):
    """Return True if ``timestamp`` is less than DELTAT before TODAY.

    ``timestamp`` is a Gerrit timestamp string. We decide the account was
    recently used if it has reviewed or pushed code within the last year.
    """
    # Gerrit apparently gives us nanoseconds which we can't parse, so drop
    # the last three digits to get down to microseconds.
    trimmed = timestamp[:-3]
    last_activity = datetime.datetime.strptime(trimmed, TIME_FORMAT)
    age = TODAY - last_activity
    return age < DELTAT
def read_email_list():
    """Read ``email_list.txt`` and return a dict keyed by email address.

    The file is expected to hold one email address per line. Surrounding
    whitespace is stripped and every address maps to a fresh empty dict that
    later steps fill in. Blank lines are ignored.
    """
    users = {}
    with open('email_list.txt') as f:
        for line in f:
            email = line.strip()
            # A blank line would otherwise create a '' entry that is later
            # sent to Gerrit as a bogus "email: is:active" query.
            if email:
                users[email] = {}
    return users
def check_recent_changes(account_id, account_info, auth):
    """Record the account's latest owned and reviewed change timestamps.

    Sets ``account_info['recent_change']`` and
    ``account_info['recent_review']`` to the ``updated`` value of the first
    change returned for the respective query, or to None when the query
    returns nothing. ``account_info['recently_used']`` is set to True when
    either timestamp passes recently_used(); it is never reset to False here.
    """
    # Gerrit appears to do a reverse sort giving you the newest results
    # first. Since we only care about the most recent activity we ask for a
    # single result (n = 1). A variant of these queries bounded with
    # "after:<SINCET date>" was tried as well and left disabled.
    lookups = (
        ('recent_change', 'owner:%s'),
        ('recent_review', 'reviewedby:%s'),
    )
    for key, template in lookups:
        matches = query_gerrit(
            'changes', {'q': template % account_id, 'n': 1}, auth)
        if not matches:
            account_info[key] = None
            continue
        account_info[key] = matches[0]['updated']
        if recently_used(account_info[key]):
            account_info['recently_used'] = True
def _record_account(email_info, account, active, auth):
    """Collect details for one account and file it under ``email_info``.

    ``account`` is one entry of an accounts query result; ``active`` says
    which of the two queries (is:active / is:inactive) returned it. The
    per-account record is stored under the account id (as a string) and the
    id is appended to the 'active' or 'inactive' list and to the
    'recently_used' or 'nonrecently_used' list of ``email_info``.
    """
    account_id = str(account['_account_id'])
    info = {'recently_used': False,
            'active': active,
            'username': None,
            'sshkeys': None,
            'openids': []}
    email_info[account_id] = info
    email_info['active' if active else 'inactive'].append(account_id)

    detail = get_account_detail(account_id, auth)
    if 'username' in detail:
        info['username'] = detail['username']

    if get_account_sshkeys(account_id, auth):
        info['sshkeys'] = True

    for eid in get_account_externalids(account_id, auth):
        identity = eid['identity']
        # We only care about login.ubuntu urls now
        if 'login.ubuntu' in identity:
            # If there is an openid and it is valid (a HEAD request on it
            # answers 200) we add it to the list of valid openids.
            if requests.head(identity).status_code == 200:
                info['openids'].append(identity)

    check_recent_changes(account_id, info, auth)
    bucket = 'recently_used' if info['recently_used'] else 'nonrecently_used'
    email_info[bucket].append(account_id)


def get_user_activity(users, auth=None):
    """Fill in ``users`` with per-account data for every email address.

    ``users`` maps email addresses to dicts and is modified in place. For
    each address the 'active', 'inactive', 'recently_used' and
    'nonrecently_used' lists are (re)initialised, then Gerrit is queried for
    the active and inactive accounts matching that address. Addresses with
    fewer than two matching accounts are reported on stdout and left with
    empty lists; otherwise every account gets a record keyed by its id.

    ``auth`` is passed through to every Gerrit query.
    """
    for email, email_info in users.items():
        email_info['active'] = []
        email_info['inactive'] = []
        email_info['recently_used'] = []
        email_info['nonrecently_used'] = []

        active_j = query_gerrit(
            'accounts', {'q': 'email:%s is:active' % email}, auth)
        inactive_j = query_gerrit(
            'accounts', {'q': 'email:%s is:inactive' % email}, auth)
        if len(active_j + inactive_j) < 2:
            # Using an admin account to query this info seems to address
            # this problem, but we'll leave this here as a double check.
            print("Email %s only has one account" % email)
            continue

        for account in active_j:
            _record_account(email_info, account, True, auth)
        for account in inactive_j:
            _record_account(email_info, account, False, auth)
def _partition_by_fields(email_info, account_ids, fields):
    """Split ``account_ids`` by whether any of ``fields`` is set.

    Returns ``(populated, empty)``: ids whose record in ``email_info`` has a
    truthy value for at least one field, and ids where every field is falsy.
    Order within each list follows ``account_ids``.
    """
    populated = []
    empty = []
    for account_id in account_ids:
        record = email_info[account_id]
        if any(record[field] for field in fields):
            populated.append(account_id)
        else:
            empty.append(account_id)
    return populated, empty


def _format_row(email, left, right):
    """Format one report row: the email, then two comma joined id lists."""
    return email + ' ' + ','.join(left) + '|' + ','.join(right)


if __name__ == '__main__':
    # Prompt for credentials; leaving either one empty runs unauthenticated.
    query_user = input('Username: ')
    query_pass = getpass.getpass('Password: ')
    auth = (query_user, query_pass) if query_user and query_pass else None

    users = read_email_list()
    get_user_activity(users, auth=auth)

    # TODO there are probably better ways to present this data.
    print()
    print('Users with inactive accounts. We may just be able to retire these.'
          '\nThen remove their external ids.')
    print('Email active accounts|inactive accounts')
    for email, info in users.items():
        if info['inactive']:
            print(_format_row(email, info['active'], info['inactive']))

    print()
    print('Users without username, ssh keys, valid openid, and no changes or reviews')
    print('Email accounts with creds or activity|accounts without creds or activity')
    for email, info in users.items():
        every_account = info['recently_used'] + info['nonrecently_used']
        with_creds, without_creds = _partition_by_fields(
            info, every_account,
            ('username', 'sshkeys', 'recent_change', 'recent_review',
             'openids'))
        if without_creds:
            print(_format_row(email, with_creds, without_creds))

    print()
    print('Users without username, sshkeys and zero changes pushed or reviews')
    print('Email accounts with usage|accounts without usage')
    for email, info in users.items():
        every_account = info['recently_used'] + info['nonrecently_used']
        with_usage, without_usage = _partition_by_fields(
            info, every_account,
            ('username', 'sshkeys', 'recent_change', 'recent_review'))
        if without_usage:
            print(_format_row(email, with_usage, without_usage))

    print()
    print('Non recently used Users without username or ssh keys')
    print('Email accounts with creds|accounts without creds')
    for email, info in users.items():
        stale = info['nonrecently_used']
        # Only emails where every known account is stale are of interest.
        if info['recently_used'] or not stale:
            continue
        with_creds, without_creds = _partition_by_fields(
            info, stale, ('username', 'sshkeys'))
        # Report only when at least one stale account lacks credentials.
        if with_creds != stale:
            print(_format_row(email, with_creds, without_creds))

    print()
    print('Non recently used Users')
    print('Email non recent accounts')
    for email, info in users.items():
        if not info['recently_used'] and info['nonrecently_used']:
            print(email + ' ' + ','.join(info['nonrecently_used']))

    print()
    print('Recently used Users')
    print('Email recent accounts|nonrecent accounts')
    for email, info in users.items():
        if info['recently_used']:
            print(_format_row(email, info['recently_used'],
                              info['nonrecently_used']))

    print()
    print('Emails that need further investigation')
    for email, info in users.items():
        # No account was recorded at all for these addresses.
        if not info['recently_used'] and not info['nonrecently_used']:
            print(email)

View File

@ -0,0 +1,71 @@
# This script reads a file with this format:
#
# email_addr account_id
#
# It will then remove all external ids with that email addr
# in them from the account specified.
# Note the account_ids and emails both may be non unique depending
# on the gerrit account situation. We iterate over each line in this
# file one at a time to avoid problems with deduping in datastructures.
import getpass
import json
import requests
def get_external_ids(account_id, auth):
    """Return the decoded external.ids response for ``account_id``.

    The request is made against the authenticated (``/a/``) endpoint on
    review.opendev.org using ``auth``.
    """
    url = ('https://review.opendev.org'
           '/a/accounts/%s/external.ids' % account_id)
    response = requests.get(url, auth=auth)
    # Strip off the gerrit json prefix (the first five characters).
    return json.loads(response.text[5:])
def is_active(account_id, auth):
    """Return False only if the account detail has a truthy 'inactive' value.

    The detail is fetched from the authenticated (``/a/``) endpoint on
    review.opendev.org using ``auth``. A missing or falsy 'inactive' entry
    means the account is treated as active.
    """
    url = ('https://review.opendev.org'
           '/a/accounts/%s/detail' % account_id)
    response = requests.get(url, auth=auth)
    # Strip off the gerrit json prefix (the first five characters).
    detail = json.loads(response.text[5:])
    return not ('inactive' in detail and detail['inactive'])
if __name__ == '__main__':
    # Imported here so the block is self-contained; exit() is a helper
    # injected by the site module, sys.exit() is always available.
    import sys

    query_user = input('Username: ')
    query_pass = getpass.getpass('Password: ')
    if not (query_user and query_pass):
        print("This script requires authentication")
        sys.exit(1)
    auth = (query_user, query_pass)

    with open('external_id_cleanups.txt') as f:
        # Lines are handled one at a time, in file order, because neither
        # emails nor account ids are necessarily unique.
        for line in f:
            line = line.strip()
            if not line:
                # A blank (often trailing) line must not abort the run
                # part way through the cleanups.
                continue
            # Any other line that is not exactly "email account_id" still
            # raises ValueError here rather than being guessed at.
            (email, account_id) = line.split()
            print(email + ' ' + account_id)
            # External ids are only removed from accounts that are already
            # inactive.
            if is_active(account_id, auth):
                print('This account is active. Skipping.')
                continue
            j = get_external_ids(account_id, auth)
            print('external IDs: ' + str(j))
            eids_to_remove = [
                eid['identity'] for eid in j
                if 'email_address' in eid and eid['email_address'] == email
            ]
            if eids_to_remove:
                print('Removing these external IDs: ' + str(eids_to_remove))
                url = 'https://review.opendev.org' \
                      '/a/accounts/%s/external.ids:delete' % account_id
                print(url)
                r = requests.post(url, json=eids_to_remove, auth=auth)
                # The response is only reported, not acted on.
                print(r.status_code)
                print(r.text)
            else:
                print('No matching external ids')

View File

@ -0,0 +1,47 @@
# Script to "retire" a gerrit account given its All-Users ref, eg:
# refs/users/34/1234
# This script should be run within the root of an All-Users repo.
#
# This will remove the preferred email from the account to fix
# issues where the preferred email has no corresponding external id
# and set the account to inactive.
#
# The commit message heredoc should be edited appropriately before
# running this script.
set -ex

# The All-Users ref to retire, eg refs/users/34/1234. Abort right away when
# it is missing instead of fetching and committing on top of whatever
# "git fetch origin" returns by default.
REF=${1:?usage: $0 refs/users/NN/NNNN}

git fetch origin "$REF"
git checkout FETCH_HEAD

# Drop the preferred email; it has no corresponding external id.
sed -i -e '/^\tpreferredEmail = .*/d' account.config

# Gerrit accounts are active by default and don't have active record
# entries when active.
if ! grep 'active = false' account.config ; then
    # printf instead of "echo -e": the latter is not portable across shells.
    printf '\tactive = false\n' >> account.config
fi

git add account.config
# The message below should be edited appropriately before running. The blank
# line after the subject keeps git from folding the whole text into the
# commit subject.
git commit -F - << EOF
Retire this account

Set the account to inactive and remove its preferred email address.
This account appears to be an old style third party CI account. One
which the Gerrit admins manually added it as a system account. For
a while now we've asked third party CI operators to transition to
openid based accounts to reduce our workload. These third party CI
systems don't appear currently active and retiring them will fix
Gerrit consistency errors. If necessary they can create more modern
openid based accounts for their CI systems.
We are doing this to fix these Gerrit consistency errors:
Account 'ABXY' has no external ID for its preferred email 'ABXY@example.com'
EOF

#echo '## Verify this commit is correct with git show HEAD'
#echo "## If things look good run git push origin HEAD:$REF"
git show HEAD
git push origin "HEAD:$REF"