584 lines
22 KiB
Python
Executable File
584 lines
22 KiB
Python
Executable File
# Copyright OpenDev Contributors
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an "AS
|
|
# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
|
# express or implied. See the License for the specific language
|
|
# governing permissions and limitations under the License.
|
|
|
|
import csv
|
|
import datetime
|
|
import html.parser
|
|
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
import urllib.parse
|
|
|
|
import requests
|
|
import yaml
|
|
|
|
|
|
def requester(
        url, params=None, headers=None, mode='live', recording=None,
        verbose=0):
    """A requests wrapper to consistently retry HTTPS queries.

    :param url: the URL to fetch
    :param params: optional query parameters (dict); None means none
    :param headers: optional request headers (dict); None means none
    :param mode: 'live' (query remote), 'record' (query remote and save
        the result into recording), or 'replay' (serve from recording)
    :param recording: dict used to store or look up recorded responses
    :param verbose: >= 2 prints each queried URL
    :returns: the response body text
    """

    # Mutable default arguments are shared between calls in Python, so
    # normalize None placeholders to fresh dicts instead
    params = {} if params is None else params
    headers = {} if headers is None else headers

    # We key recordings of queries by the URL with its encoded parameters
    # appended; a plain string key stays hashable (a tuple containing a
    # dict is not) and survives a YAML dump/load round trip
    key = url
    if params:
        key = '%s?%s' % (url, urllib.parse.urlencode(params, doseq=True))

    if mode == 'replay':
        # In replay mode, use recorded results for all queries
        text = recording[key]
    else:
        # In live or record modes, actually use the remote API instead
        retry = requests.Session()
        retry.mount("https://", requests.adapters.HTTPAdapter(max_retries=3))
        response = retry.get(url=url, params=params, headers=headers)
        text = response.text
        if verbose >= 2:
            print("Queried: %s" % response.url)
        if mode == 'record':
            # In record mode, also save a copy of the query results to replay
            recording[key] = text

    return text
|
|
|
|
|
|
def decode_json(raw):
    """Trap JSON decoding failures and provide more detailed errors.

    Strips Gerrit's anti-XSS prefix when present, then decodes the
    remainder as JSON. On failure, writes both the raw and trimmed
    payloads to stderr before re-raising the decode exception.

    :param raw: the raw response body text
    :returns: the decoded JSON value
    """

    # Gerrit's REST API prepends a JSON-breaker to avoid XSS vulnerabilities
    if raw.startswith(")]}'"):
        trimmed = raw[4:]
    else:
        trimmed = raw

    # Try to decode and bail with much detail if it fails
    try:
        decoded = json.loads(trimmed)
    except Exception:
        # The previous message referenced raw.url, but raw is a str and
        # has no url attribute, so the handler itself raised an
        # AttributeError; report the payloads we actually have instead
        print('\nfailed to decode JSON from response:\n\n %s\n'
              '\nwith trimmed detail:\n\n %s\n' % (raw, trimmed),
              file=sys.stderr)
        raise
    return decoded
|
|
|
|
|
|
def query_gerrit(method, params=None, mode='live', recording=None, verbose=0):
    """Query the Gerrit REST API and make or replay a recording.

    :param method: the REST endpoint path (e.g. 'changes/')
    :param params: optional query parameters; None means none (a None
        default avoids Python's shared mutable default argument pitfall)
    :param mode: passed through to requester ('live'/'record'/'replay')
    :param recording: dict of recorded responses for record/replay
    :param verbose: verbosity level passed through to requester
    :returns: the decoded JSON response
    """

    url = 'https://review.opendev.org/%s' % method
    result = requester(
        url,
        params={} if params is None else params,
        headers={'Accept': 'application/json'},
        mode=mode,
        recording=recording,
        verbose=verbose)
    return decode_json(result)
|
|
|
|
|
|
def from_gerrit_time(when):
    """Translate a Gerrit date/time string into a naive datetime object."""

    # Gerrit timestamps may carry fractional seconds after a dot; drop
    # them, since the parse format only covers whole seconds
    stamp, _, _fraction = when.partition('.')
    return datetime.datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S')
|
|
|
|
|
|
def to_gerrit_time(when):
    """Translate a datetime object into a Gerrit date/time string."""

    # Same representation Gerrit's API emits, without fractional seconds
    return '{:%Y-%m-%d %H:%M:%S}'.format(when)
|
|
|
|
|
|
def get_projects(recording=None, verbose=0):
    """Return a sorted list of all namespaced code projects in Gerrit"""

    found = query_gerrit(
        'projects/', params={'type': 'code'}, recording=recording,
        verbose=verbose)
    # Only repositories with a namespace prefix (a '/' in the name,
    # e.g. "opendev/engagement") are of interest here
    return sorted(name for name in found if '/' in name)
|
|
|
|
|
|
def usage_error():
    """Write a generic usage message to stderr and exit nonzero"""

    # print() supplies the trailing newline the original write() included
    print(
        'ERROR: specify report period like YEAR, YEAR-H[1-2], YEAR-Q[1-4],\n'
        ' or YEAR-[01-12], optionally prefixed by record- or replay-\n'
        ' if you want to make a recording or reuse a prior recording',
        file=sys.stderr)
    sys.exit(1)
|
|
|
|
|
|
def parse_report_period(when):
    """Parse a supplied report period string, returning a tuple of
    after and before datetime objects.

    Accepted forms (per usage_error): YYYY, YYYY-H[1-2], YYYY-Q[1-4],
    or YYYY-[01-12]; anything else prints a usage message and exits.

    :param when: the report period string
    :returns: (after, before) naive datetime bounds for the period
    """

    # Match each supported form once; the originals re-ran .match() for
    # every group extraction. The month alternation rejects 00 and 13-99
    # and the half-year class is [1-2] (H3/H4 previously slipped past
    # the regex and crashed on an out-of-range month).
    monthly = re.match(r'^(\d{4})-(0[1-9]|1[0-2])$', when)
    quarterly = re.match(r'^(\d{4})-q([1-4])$', when, re.IGNORECASE)
    halfyearly = re.match(r'^(\d{4})-h([1-2])$', when, re.IGNORECASE)
    yearly = re.match(r'^\d{4}$', when)
    if monthly:
        start_year = int(monthly.group(1))
        start_month = int(monthly.group(2))
        span = 1
    elif quarterly:
        start_year = int(quarterly.group(1))
        start_month = 1 + 3 * (int(quarterly.group(2)) - 1)
        span = 3
    elif halfyearly:
        start_year = int(halfyearly.group(1))
        start_month = 1 + 6 * (int(halfyearly.group(2)) - 1)
        span = 6
    elif yearly:
        start_year = int(yearly.group())
        start_month = 1
        span = 12
    else:
        usage_error()
    # The exclusive upper bound is the first day of the following period,
    # rolling the year over when the span passes December
    end_year = start_year + (start_month + span - 1) // 12
    end_month = 1 + (start_month + span - 1) % 12
    after = datetime.datetime(start_year, start_month, 1)
    before = datetime.datetime(end_year, end_month, 1)
    return after, before
|
|
|
|
|
|
def parse_command_line():
    """Parse the command line to obtain the report period, then return it"""

    # Exactly one positional argument (the report period) is expected;
    # anything else prints usage and exits
    if len(sys.argv) != 2:
        usage_error()
    return sys.argv[1]
|
|
|
|
|
|
def report_times(report, after, before):
    """Add timestamp values to provided report.

    :param report: the report dict to mutate (a 'times' key is added)
    :param after: datetime lower bound of the report period
    :param before: datetime upper bound of the report period
    :returns: the same report dict, for convenience
    """

    report['times'] = dict()
    report['times']['after'] = to_gerrit_time(after)
    report['times']['before'] = to_gerrit_time(before)
    # datetime.datetime.utcnow() is deprecated since Python 3.12; an
    # aware UTC datetime formats identically through to_gerrit_time
    # because the format string carries no timezone field
    report['times']['generated'] = to_gerrit_time(
        datetime.datetime.now(datetime.timezone.utc))
    return report
|
|
|
|
|
|
def get_ml_index(verbose=0):
    """Fetch and parse the YAML index of mailing list archive sites."""

    raw = requester(
        'http://lists.opendev.org/archives.yaml', verbose=verbose)
    return yaml.safe_load(raw)
|
|
|
|
|
|
def get_ml_archive(listname, site, yearmonth, verbose=0):
    """Fetch one month of a mailing list's pipermail text archive."""

    year, month = yearmonth
    # Pipermail names monthly archives by full English month name
    monthname = datetime.date(1, month, 1).strftime('%B')
    archive_url = 'http://%s/pipermail/%s/%s-%s.txt' % (
        site, listname, year, monthname)
    return requester(archive_url, verbose=verbose)
|
|
|
|
|
|
def add_ml_activity(ml_activity, site, archive):
    """Tally per-sender post counts from one pipermail text archive.

    Increments counts in ml_activity[site] and ml_activity['_total'],
    keyed by sender address, skipping known automated senders and
    service domains. Both buckets must already exist in ml_activity.
    """

    if not archive:
        return

    # Automated senders excluded from the tallies, by full address...
    bot_addresses = frozenset((
        'build.starlingx@gmail.com',
        'hudson@openstack.org',
        'info@bitergia.com',
        'infra-root@openstack.org',
        'jenkins@openstack.org',
        'no-reply@openstack.org',
        'readthedocs@readthedocs.org',
        'review@openstack.org',
        'zuul@opendev.org',
        'zuul@openstack.org',
        ))
    # ...and by sending domain
    bot_domains = frozenset((
        'bugs.launchpad.net',
        'lists.airshipit.org',
        'lists.katacontainers.io',
        'lists.opendev.org',
        'lists.openinfra.dev',
        'lists.openstack.org',
        'lists.starlingx.io',
        'lists.zuul-ci.org',
        'review.opendev.org',
        'review.openstack.org',
        'storyboard.openstack.org',
        'storyboard.opendev.org',
        'zuul.opendev.org',
        'zuul.openstack.org',
        ))

    # Take care to avoid incorrectly matching on lines which begin with
    # the word From inside the message body: require the full mbox-style
    # envelope line including weekday, month, and timestamp
    envelope = re.compile(
        r'From ([^ ]+) at ([0-9A-Za-z\.-]+\.[0-9A-Za-z\.-]+) '
        r'(Sun|Mon|Tue|Wed|Thu|Fri|Sat) '
        r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) '
        r'[ 123][0-9] [012][0-9]:[0-9]{2}:[0-9]{2} [0-9]{4}$')

    for line in archive.split('\n'):
        found = envelope.match(line)
        if not found:
            continue
        localpart = found.group(1)
        domainpart = found.group(2).lower()
        address = '%s@%s' % (localpart, domainpart)
        if address.lower() in bot_addresses or domainpart in bot_domains:
            continue
        ml_activity[site][address] = ml_activity[site].get(address, 0) + 1
        ml_activity['_total'][address] = (
            ml_activity['_total'].get(address, 0) + 1)
|
|
|
|
|
|
def add_all_ml_activity(ml_activity, sites, yearmonth, verbose=0):
    """Accumulate posting activity for every list on every known site
    for the given (year, month) tuple."""

    for site, listnames in sites.items():
        # Make sure a per-site tally bucket exists before counting
        ml_activity.setdefault(site, {})
        for listname in listnames:
            archive = get_ml_archive(
                listname, site, yearmonth, verbose=verbose)
            add_ml_activity(ml_activity, site, archive)
|
|
|
|
|
|
class ChannelsListParser(html.parser.HTMLParser):
    """Collect IRC channel names from the irclogs index page.

    Channel links look like <a href="%23channel/">; each matching href
    is URL-decoded (so '%23' becomes '#') and appended to channels.
    """

    def __init__(self):
        # Accumulated list of decoded channel names, in document order
        self.channels = list()
        super().__init__()

    def handle_starttag(self, tag, attrs):
        if tag != 'a':
            return
        # Look up href by name rather than assuming it is the first
        # attribute; attrs[0][1] raised IndexError on attribute-less
        # tags and misfired when href was not the first attribute
        href = dict(attrs).get('href') or ''
        if href.startswith('%23'):
            self.channels.append(urllib.parse.unquote(href.strip('/')))
|
|
|
|
|
|
def get_channels_list(verbose=0):
    """Scrape the irclogs index for the list of logged channel names."""

    index_html = requester(
        'https://meetings.opendev.org/irclogs/', verbose=verbose)
    collector = ChannelsListParser()
    collector.feed(index_html)
    return collector.channels
|
|
|
|
|
|
class LogsListParser(html.parser.HTMLParser):
    """Collect daily log file names from a channel's irclogs listing.

    Log links begin with the URL-encoded channel name (e.g.
    '%23channel.2023-01-02.log.html'); matching hrefs are kept verbatim.
    """

    def __init__(self):
        # Accumulated list of raw log file hrefs, in document order
        self.logs = list()
        super().__init__()

    def handle_starttag(self, tag, attrs):
        if tag != 'a':
            return
        # Look up href by name rather than assuming it is the first
        # attribute; attrs[0][1] raised IndexError on attribute-less
        # tags and misfired when href was not the first attribute
        href = dict(attrs).get('href') or ''
        if href.startswith('%23'):
            self.logs.append(href)
|
|
|
|
|
|
def get_channel_logs(channel, yearmonth, verbose=0):
    """Return the lines of all daily logs for a channel in a given
    (year, month), concatenated across the whole month."""

    year, month = yearmonth
    quoted = urllib.parse.quote(channel)
    lister = LogsListParser()
    lister.feed(requester(
        'https://meetings.opendev.org/irclogs/%s/' % quoted,
        verbose=verbose))
    collected = ''
    # Probe every possible day number; days with no activity simply
    # don't appear in the directory listing
    for day in range(1, 32):
        logname = '%s.%d-%02d-%02d.log' % (quoted, year, month, day)
        if '%s.html' % logname not in lister.logs:
            continue
        text = requester(
            'https://meetings.opendev.org/irclogs/%s/%s' % (
                quoted, logname), verbose=verbose)
        if text:
            collected += text
    return collected.split('\n')
|
|
|
|
|
|
def add_chat_activity(chat_activity, logs, namespace, verbose=0):
    """Tally per-nick message counts from IRC log text.

    Increments counts in chat_activity[namespace] (created on demand)
    and chat_activity['_all_channels'] (which must already exist),
    skipping known bot nicks.
    """

    if not logs:
        return

    # Service bots whose messages should not count as human activity
    bots = (
        'opendevmeet',
        'opendevreview',
        'opendevstatus',
        'openstack',
        'openstackgerrit',
        'openstackstatus',
        )
    # Log lines start with a 19-character timestamp then "<nick> "
    stamped = re.compile(r'[0-9T:-]{19} <([^ ]+)> ')

    for line in logs.split('\n'):
        found = stamped.match(line)
        if not found:
            continue
        nick = found.group(1).strip('@')
        if nick in bots:
            continue
        if namespace not in chat_activity:
            chat_activity[namespace] = {}
            if verbose >= 1:
                print("Adding namespace: %s" % namespace)
        overall = chat_activity['_all_channels']
        overall[nick] = overall.get(nick, 0) + 1
        if verbose >= 1:
            print("Found chat nick: %s" % nick)
        scoped = chat_activity[namespace]
        scoped[nick] = scoped.get(nick, 0) + 1
|
|
|
|
|
|
def main(verbose=0):
    """Utility entry point: build an activity report for one period.

    Parses the period from the command line, gathers Gerrit change
    activity, mailing list posts, and IRC log activity, then writes a
    YAML report plus CSV tables under reports/.
    """

    argument = parse_command_line()
    # A record-/replay- prefix selects the recording mode and is
    # stripped before the period itself is parsed
    if argument.startswith('record-'):
        mode = 'record'
        argument = argument[len(mode)+1:]
    elif argument.startswith('replay-'):
        mode = 'replay'
        argument = argument[len(mode)+1:]
    else:
        mode = 'live'

    recordfn = 'recordings/%s.yaml' % argument
    if mode == 'record':
        recording = {}
    elif mode == 'replay':
        # yaml.load(..., loader=yaml.loader.safeLoader) was broken: the
        # keyword is Loader and the class is SafeLoader, so replay mode
        # always raised; safe_load is the supported equivalent
        with open(recordfn) as recordfile:
            recording = yaml.safe_load(recordfile)
    else:
        recording = None

    after, before = parse_report_period(argument)
    changes = dict()

    # NOTE(review): mode is not passed to get_projects or to the
    # query_gerrit call below, so Gerrit queries always run live even in
    # record/replay modes — confirm whether that is intended
    # Shard querying by project, to help with the inherent instability of
    # result pagination from the Gerrit API
    for project in get_projects(recording=recording, verbose=verbose):
        if verbose >= 1:
            print("Checking project: %s" % project)
        offset = 0
        # Loop due to unavoidable query result pagination
        while offset >= 0:
            # We only constrain the query by the after date, as changes created
            # between the before and after date may have been updated more
            # recently with a new revision or comment
            new_changes = query_gerrit('changes/', params={
                'q': 'project:%s after:{%s}' % (
                    project, to_gerrit_time(after)),
                'no-limit': '1',
                'start': offset,
                'o': ['ALL_REVISIONS', 'MESSAGES', 'SKIP_DIFFSTAT'],
                }, recording=recording, verbose=verbose)
            # Since we redundantly query ranges with offsets to help combat
            # pagination instability, we must deduplicate results
            for change in new_changes:
                if change['id'] not in changes:
                    changes[change['id']] = change
            # Offset additional pages by half the returned entry count to help
            # avoid missing changes due to pagination instability; use
            # floor division so the start parameter stays an integer
            # (true division produced a float offset in the query string)
            if new_changes and new_changes[-1].get('_more_changes', False):
                offset += len(new_changes) // 2
            else:
                offset = -1

    report = {
        'chat_namespaces': dict(),
        'ml_sites': dict(),
        'repo_namespaces': dict(),
        }
    report_times(report, after, before)
    committers = dict()
    projects_active = dict()
    reviewers = dict()
    for change in changes.values():
        namespace = change['project'].split("/")[0]
        if namespace not in report['repo_namespaces']:
            report['repo_namespaces'][namespace] = {
                'changes_created': 0,
                'changes_merged': 0,
                'review_automated': 0,
                'reviewer_messages': 0,
                'revisions_pushed': 0,
                }
        if namespace not in projects_active:
            projects_active[namespace] = set()
        if after < from_gerrit_time(change['created']) < before:
            # Note that the changes are not returned in chronological
            # order, so we have to test all of them and can't short-circuit
            # after the first change which was created too late
            report['repo_namespaces'][namespace]['changes_created'] += 1
            projects_active[namespace].add(change['project'])
            if namespace not in committers:
                committers[namespace] = set()
            committers[namespace].add(change['owner']['_account_id'])
            if verbose >= 2:
                print("Found created change: %s" % change['_number'])
        if ('submitted' in change and after < from_gerrit_time(
                change['submitted']) < before):
            report['repo_namespaces'][namespace]['changes_merged'] += 1
            projects_active[namespace].add(change['project'])
            if verbose >= 2:
                print("Found merged change: %s" % change['_number'])
        for revision in change['revisions'].values():
            if after < from_gerrit_time(revision['created']) < before:
                report['repo_namespaces'][namespace]['revisions_pushed'] += 1
                projects_active[namespace].add(change['project'])
                if namespace not in committers:
                    committers[namespace] = set()
                committers[namespace].add(revision['uploader']['_account_id'])
                if verbose >= 2:
                    print("Found change revision: %s,%s" % (
                        change['_number'], revision['_number']))
        for message in change['messages']:
            if after < from_gerrit_time(message['date']) < before:
                # Split review comments into automated (tagged) and
                # human reviewer messages, ignoring upload notices
                if ('tag' in message and message['tag'].startswith(
                        'autogenerated:')):
                    report['repo_namespaces'][namespace][
                        'review_automated'] += 1
                    projects_active[namespace].add(change['project'])
                    if verbose >= 2:
                        print("Found automated comment: %s,%s,%s (%s)" % (
                            change['_number'],
                            message['_revision_number'],
                            message['id'],
                            message['date']))
                elif not message['message'].startswith(
                        'Uploaded patch set'):
                    report['repo_namespaces'][namespace][
                        'reviewer_messages'] += 1
                    projects_active[namespace].add(change['project'])
                    if namespace not in reviewers:
                        reviewers[namespace] = set()
                    reviewers[namespace].add(message['author']['_account_id'])
                    if verbose >= 2:
                        print("Found reviewer comment: %s,%s,%s (%s)" % (
                            change['_number'],
                            message['_revision_number'],
                            message['id'],
                            message['date']))
    # Collapse the per-namespace contributor sets into counts, keeping
    # deduplicated global sets for the overall totals
    all_committers = set()
    for namespace in committers:
        report['repo_namespaces'][namespace]['committers'] = len(
            committers[namespace])
        all_committers = all_committers.union(committers[namespace])
    all_reviewers = set()
    for namespace in reviewers:
        report['repo_namespaces'][namespace]['reviewers'] = len(
            reviewers[namespace])
        all_reviewers = all_reviewers.union(reviewers[namespace])
    for namespace in projects_active:
        report['repo_namespaces'][namespace]['projects_active'] = len(
            projects_active[namespace])

    # Mailing list activity, iterated month by month over the period
    # (scalar_month counts months since year 0 to simplify the range)
    ml_activity = {'_total': {}}
    for scalar_month in range(
            after.year * 12 + after.month,
            before.year * 12 + before.month):
        yearmonth = ((scalar_month - 1) // 12, scalar_month % 12 or 12)
        add_all_ml_activity(
            ml_activity, get_ml_index(), yearmonth, verbose=verbose)
    report['ml_sites'] = {}
    for site in ml_activity:
        report['ml_sites'][site] = {'posts': 0, 'senders': 0}
        for posts in ml_activity[site].values():
            report['ml_sites'][site]['posts'] += posts
            report['ml_sites'][site]['senders'] += 1

    # IRC activity, grouped by the channel name's leading namespace
    chat_activity = {'_all_channels': {}}
    channels = get_channels_list(verbose=verbose)
    for channel in channels:
        namespace = channel.split('-')[0].strip('#')
        for scalar_month in range(
                after.year * 12 + after.month,
                before.year * 12 + before.month):
            yearmonth = ((scalar_month - 1) // 12, scalar_month % 12 or 12)
            for logs in get_channel_logs(channel, yearmonth, verbose=verbose):
                add_chat_activity(
                    chat_activity, logs, namespace, verbose=verbose)
    for namespace in chat_activity:
        report['chat_namespaces'][namespace] = {
            'messages': sum(chat_activity[namespace].values()),
            'nicks': len(chat_activity[namespace]),
            }

    report['totals'] = dict()
    report['totals']['active_repo_namespaces'] = len(report['repo_namespaces'])
    report['totals']['committers'] = len(all_committers)
    report['totals']['reviewers'] = len(all_reviewers)
    additive_keys = (
        'changes_created',
        'changes_merged',
        'projects_active',
        'review_automated',
        'reviewer_messages',
        'revisions_pushed',
        )
    for key in additive_keys:
        report['totals'][key] = 0
    # Operate on a copy of the keys since we'll be altering the dict
    for namespace in list(report['repo_namespaces'].keys()):
        # Cull inactive namespaces from the report
        if not report['repo_namespaces'][namespace]['projects_active']:
            del report['repo_namespaces'][namespace]
            continue
        # Summation key totals
        for key in additive_keys:
            report['totals'][key] += report['repo_namespaces'][namespace][key]

    # Fold the synthetic _total bucket into the overall totals, then
    # drop it so it doesn't show up as a site of its own
    report['totals']['mailing_list_posts'] = (
        report['ml_sites']['_total']['posts'])
    report['totals']['mailing_list_senders'] = (
        report['ml_sites']['_total']['senders'])
    del report['ml_sites']['_total']
    report['totals']['mailing_list_sites'] = len(report['ml_sites'])

    # Likewise for the synthetic _all_channels chat bucket
    report['totals']['chat_messages_logged'] = sum(
        chat_activity['_all_channels'].values())
    report['totals']['chat_nicknames_logged'] = len(
        chat_activity['_all_channels'])
    del report['chat_namespaces']['_all_channels']
    report['totals']['chat_channel_namespaces'] = len(
        report['chat_namespaces'])

    # Write a recording if requested (with-blocks ensure the handles
    # are flushed and closed; the originals were left dangling)
    if mode == 'record':
        os.makedirs(os.path.dirname(recordfn), exist_ok=True)
        with open(recordfn, 'w') as recordfile:
            recordfile.write(yaml.dump(recording))

    # Write the full YAML structured data report
    os.makedirs('reports', exist_ok=True)
    with open('reports/%s.yaml' % argument, 'w') as reportfile:
        reportfile.write(yaml.dump(report))

    # Write the one-dimensional CSV tabular reports; newline='' is the
    # csv module's documented requirement for output files
    for tabname in ('times', 'totals'):
        table = [[argument, tabname]]
        for rowname in report[tabname]:
            table.append([rowname, report[tabname][rowname]])
        with open('reports/%s_%s.csv' % (argument, tabname), 'w',
                  newline='') as tabfile:
            csv.writer(tabfile).writerows(table)

    # Write the two-dimensional CSV tabular reports, accumulating one
    # row per metric with one column per namespace/site
    for tabname in ('chat_namespaces', 'ml_sites', 'repo_namespaces'):
        table = [[argument]]
        for colname in report[tabname]:
            table[0].append(colname)
            for rowname in report[tabname][colname]:
                row_updated = False
                for row in table[1:]:
                    if row[0] == rowname:
                        row.append(report[tabname][colname][rowname])
                        row_updated = True
                        break
                if not row_updated:
                    table.append(
                        [rowname, report[tabname][colname][rowname]])
        with open('reports/%s_%s.csv' % (argument, tabname), 'w',
                  newline='') as tabfile:
            csv.writer(tabfile).writerows(table)
|