# Copyright OpenDev Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS
# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language
# governing permissions and limitations under the License.
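"""Generate community engagement statistics for OpenDev services

Collates activity from the Gerrit code review REST API, Mailman/Pipermail
list archives, and logged IRC channels into YAML and CSV reports for a
specified reporting period.
"""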
import csv
import datetime
import html.parser
import json
import os
import re
import sys
import urllib.parse

import requests
import yaml


def requester(
        url, params={}, headers={}, mode='live', recording=None, verbose=0):
"""A requests wrapper to consistently retry HTTPS queries"""
# We key recordings of queries based on a tuple of their URL and parameters
# (this may not be stable in Python<3.6 due to lack of dict ordering)
if mode == 'replay':
# In replay mode, use recorded results for all queries
text = recording[(url, params)]
else:
# In live or record modes, actually use the remote API instead
        retry = requests.Session()
        adapter = requests.adapters.HTTPAdapter(max_retries=3)
        retry.mount('https://', adapter)
        # The mailing list archive index and files are fetched over plain
        # HTTP, so retry those connections as well
        retry.mount('http://', adapter)
response = retry.get(url=url, params=params, headers=headers)
text = response.text
if verbose >= 2:
print("Queried: %s" % response.url)
if mode == 'record':
# In record mode, also save a copy of the query results to replay
            recording[key] = text
return text


def decode_json(raw):
"""Trap JSON decoding failures and provide more detailed errors"""
# Gerrit's REST API prepends a JSON-breaker to avoid XSS vulnerabilities
if raw.startswith(")]}'"):
trimmed = raw[4:]
else:
trimmed = raw
# Try to decode and bail with much detail if it fails
try:
decoded = json.loads(trimmed)
    except json.JSONDecodeError:
        print('\nJSON decoding failed for query response:\n\n    %s\n'
              % trimmed, file=sys.stderr)
        raise
return decoded


def query_gerrit(method, params={}, mode='live', recording=None, verbose=0):
"""Query the Gerrit REST API and make or replay a recording"""
url = 'https://review.opendev.org/%s' % method
result = requester(
url,
params=params,
headers={'Accept': 'application/json'},
mode=mode,
recording=recording,
verbose=verbose)
return decode_json(result)


def from_gerrit_time(when):
"""Translate a Gerrit date/time string into a naive datetime object."""
return datetime.datetime.strptime(when.split('.')[0], '%Y-%m-%d %H:%M:%S')


def to_gerrit_time(when):
"""Translate a datetime object into a Gerrit date/time string."""
return when.strftime('%Y-%m-%d %H:%M:%S')


def get_projects(recording=None, verbose=0):
"""Return a sorted list of all namespaced code projects in Gerrit"""
all_projects = query_gerrit(
'projects/', params={'type': 'code'}, recording=recording,
verbose=verbose)
projects = list()
for (project, details) in all_projects.items():
if '/' in project:
projects.append(project)
return sorted(projects)


def usage_error():
"""Write a generic usage message to stderr and exit nonzero"""
sys.stderr.write(
'ERROR: specify report period like YEAR, YEAR-H[1-2], YEAR-Q[1-4],\n'
' or YEAR-[01-12], optionally prefixed by record- or replay-\n'
' if you want to make a recording or reuse a prior recording\n')
sys.exit(1)


def parse_report_period(when):
"""Parse a supplied report period string, returning a tuple of
    after and before naive datetime objects

    For example, '2024-q2' yields (datetime(2024, 4, 1),
    datetime(2024, 7, 1))
    """
monthly = re.compile(r'^(\d{4})-(\d{2})$')
quarterly = re.compile(r'^(\d{4})-q([1-4])$', re.IGNORECASE)
    halfyearly = re.compile(r'^(\d{4})-h([1-2])$', re.IGNORECASE)
yearly = re.compile(r'^\d{4}$')
if monthly.match(when):
start_year = int(monthly.match(when).group(1))
start_month = int(monthly.match(when).group(2))
end_year = start_year + start_month // 12
end_month = 1 + start_month % 12
elif quarterly.match(when):
start_year = int(quarterly.match(when).group(1))
start_month = 1 + 3 * (int(quarterly.match(when).group(2)) - 1)
end_year = start_year + (start_month + 2) // 12
end_month = 1 + (start_month + 2) % 12
elif halfyearly.match(when):
start_year = int(halfyearly.match(when).group(1))
start_month = 1 + 6 * (int(halfyearly.match(when).group(2)) - 1)
end_year = start_year + (start_month + 5) // 12
end_month = 1 + (start_month + 5) % 12
elif yearly.match(when):
start_year = int(yearly.match(when).group())
start_month = 1
end_year = start_year + 1
end_month = 1
else:
usage_error()
after = datetime.datetime(start_year, start_month, 1)
before = datetime.datetime(end_year, end_month, 1)
return after, before


def parse_command_line():
"""Parse the command line to obtain the report period, then return it"""
if len(sys.argv) == 2:
return sys.argv[1]
else:
usage_error()


def report_times(report, after, before):
"""Add timestamp values to provided report"""
report['times'] = dict()
report['times']['after'] = to_gerrit_time(after)
report['times']['before'] = to_gerrit_time(before)
report['times']['generated'] = to_gerrit_time(datetime.datetime.utcnow())
return report


def get_ml_index(verbose=0):
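    """Fetch and parse the YAML index of mailing list sites and lists"""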
sites = yaml.safe_load(
requester('http://lists.opendev.org/archives.yaml', verbose=verbose))
return sites


def get_ml_archive(listname, site, yearmonth, verbose=0):
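    """Fetch one month of a list's Pipermail archive as raw mbox text"""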
year, month = yearmonth
monthname = datetime.date(1, month, 1).strftime('%B')
return requester('http://%s/pipermail/%s/%s-%s.txt' % (
site,
listname,
year,
monthname,
), verbose=verbose)


def add_ml_activity(ml_activity, site, archive):
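    """Tally posts per sender address from a raw Pipermail archive,
    skipping known automated senders and notification gateways"""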
if archive:
for line in archive.split('\n'):
# Take care to avoid incorrectly matching on lines which
# begin with the word From inside the message body
fromline = re.match(
r'From ([^ ]+) at ([0-9A-Za-z\.-]+\.[0-9A-Za-z\.-]+) '
r'(Sun|Mon|Tue|Wed|Thu|Fri|Sat) '
r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) '
r'[ 123][0-9] [012][0-9]:[0-9]{2}:[0-9]{2} [0-9]{4}$',
line)
if fromline:
localpart, domainpart = fromline.groups()[:2]
domainpart = domainpart.lower()
address = '%s@%s' % (localpart, domainpart)
if address.lower() in (
'build.starlingx@gmail.com',
'hudson@openstack.org',
'info@bitergia.com',
'infra-root@openstack.org',
'jenkins@openstack.org',
'no-reply@openstack.org',
'readthedocs@readthedocs.org',
'review@openstack.org',
'zuul@opendev.org',
'zuul@openstack.org',
) or domainpart in (
'bugs.launchpad.net',
'lists.airshipit.org',
'lists.katacontainers.io',
'lists.opendev.org',
'lists.openinfra.dev',
'lists.openstack.org',
'lists.starlingx.io',
'lists.zuul-ci.org',
'review.opendev.org',
'review.openstack.org',
'storyboard.openstack.org',
'storyboard.opendev.org',
'zuul.opendev.org',
'zuul.openstack.org',
):
continue
if address in ml_activity[site]:
ml_activity[site][address] += 1
else:
ml_activity[site][address] = 1
if address in ml_activity['_total']:
ml_activity['_total'][address] += 1
else:
ml_activity['_total'][address] = 1


def add_all_ml_activity(ml_activity, sites, yearmonth, verbose=0):
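    """Accumulate posting activity for every list on each site for one
    month"""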
for site in sites:
if site not in ml_activity:
ml_activity[site] = {}
for listname in sites[site]:
archive = get_ml_archive(
listname, site, yearmonth, verbose=verbose)
add_ml_activity(ml_activity, site, archive)


class ChannelsListParser(html.parser.HTMLParser):
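    """Parse the channel names linked from the IRC log site index"""
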
def __init__(self):
self.channels = list()
super().__init__()

    def handle_starttag(self, tag, attrs):
        # Logged channels are linked as percent-encoded directory names
        # like %23channelname/
        href = dict(attrs).get('href', '')
        if tag == 'a' and href.startswith('%23'):
            self.channels.append(urllib.parse.unquote(href.strip('/')))


def get_channels_list(verbose=0):
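    """Return the list of channels logged at meetings.opendev.org"""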
parser = ChannelsListParser()
parser.feed(
requester('https://meetings.opendev.org/irclogs/', verbose=verbose))
return parser.channels


class LogsListParser(html.parser.HTMLParser):
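    """Parse the daily log file names linked from a channel's index"""
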
def __init__(self):
self.logs = list()
super().__init__()

    def handle_starttag(self, tag, attrs):
        # Daily logs are linked as percent-encoded file names like
        # %23channelname.YYYY-MM-DD.log.html
        href = dict(attrs).get('href', '')
        if tag == 'a' and href.startswith('%23'):
            self.logs.append(href)


def get_channel_logs(channel, yearmonth, verbose=0):
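    """Fetch one month of a channel's daily logs, returning a list of
    individual log lines"""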
year, month = yearmonth
channel = urllib.parse.quote(channel)
logs = ''
parser = LogsListParser()
parser.feed(requester(
'https://meetings.opendev.org/irclogs/%s/' % channel, verbose=verbose))
for day in range(1, 32):
if '%s.%d-%02d-%02d.log.html' % (
channel, year, month, day) not in parser.logs:
continue
result = requester(
'https://meetings.opendev.org/irclogs/%s/%s.%d-%02d-%02d.log' % (
channel,
channel,
year,
month,
day,
), verbose=verbose)
if result:
logs += result
return logs.split('\n')


def add_chat_activity(chat_activity, logs, namespace, verbose=0):
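    """Tally messages per nick from IRC log lines, skipping known bots"""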
    if logs:
        for line in logs:
logline = re.match(r'[0-9T:-]{19} <([^ ]+)> ', line)
if logline:
nick = logline.group(1).strip('@')
if nick in (
'opendevmeet',
'opendevreview',
'opendevstatus',
'openstack',
'openstackgerrit',
'openstackstatus',
):
continue
if namespace not in chat_activity:
chat_activity[namespace] = {}
if verbose >= 1:
print("Adding namespace: %s" % namespace)
if nick in chat_activity['_all_channels']:
chat_activity['_all_channels'][nick] += 1
else:
chat_activity['_all_channels'][nick] = 1
if verbose >= 1:
print("Found chat nick: %s" % nick)
if nick in chat_activity[namespace]:
chat_activity[namespace][nick] += 1
else:
chat_activity[namespace][nick] = 1


def main(verbose=0):
"""Utility entry point"""
argument = parse_command_line()
if argument.startswith('record-'):
mode = 'record'
argument = argument[len(mode)+1:]
elif argument.startswith('replay-'):
mode = 'replay'
argument = argument[len(mode)+1:]
else:
mode = 'live'
recordfn = 'recordings/%s.yaml' % argument
if mode == 'record':
recording = {}
elif mode == 'replay':
        with open(recordfn) as record_file:
            recording = yaml.safe_load(record_file)
else:
recording = None
after, before = parse_report_period(argument)
changes = dict()
# Shard querying by project, to help with the inherent instability of
# result pagination from the Gerrit API
for project in get_projects(recording=recording, verbose=verbose):
if verbose >= 1:
print("Checking project: %s" % project)
offset = 0
# Loop due to unavoidable query result pagination
while offset >= 0:
# We only constrain the query by the after date, as changes created
# between the before and after date may have been updated more
# recently with a new revision or comment
new_changes = query_gerrit('changes/', params={
'q': 'project:%s after:{%s}' % (
project, to_gerrit_time(after)),
'no-limit': '1',
'start': offset,
'o': ['ALL_REVISIONS', 'MESSAGES', 'SKIP_DIFFSTAT'],
}, recording=recording, verbose=verbose)
# Since we redundantly query ranges with offsets to help combat
# pagination instability, we must deduplicate results
for change in new_changes:
if change['id'] not in changes:
changes[change['id']] = change
# Offset additional pages by half the returned entry count to help
# avoid missing changes due to pagination instability
if new_changes and new_changes[-1].get('_more_changes', False):
                offset += len(new_changes) // 2
else:
offset = -1
report = {
'chat_namespaces': dict(),
'ml_sites': dict(),
'repo_namespaces': dict(),
}
report_times(report, after, before)
committers = dict()
projects_active = dict()
reviewers = dict()
for change in changes.values():
namespace = change['project'].split("/")[0]
if namespace not in report['repo_namespaces']:
report['repo_namespaces'][namespace] = {
'changes_created': 0,
'changes_merged': 0,
'review_automated': 0,
'reviewer_messages': 0,
'revisions_pushed': 0,
}
if namespace not in projects_active:
projects_active[namespace] = set()
if after < from_gerrit_time(change['created']) < before:
# Note that the changes are not returned in chronological
# order, so we have to test all of them and can't short-circuit
# after the first change which was created too late
report['repo_namespaces'][namespace]['changes_created'] += 1
projects_active[namespace].add(change['project'])
if namespace not in committers:
committers[namespace] = set()
committers[namespace].add(change['owner']['_account_id'])
if verbose >= 2:
print("Found created change: %s" % change['_number'])
if ('submitted' in change and after < from_gerrit_time(
change['submitted']) < before):
report['repo_namespaces'][namespace]['changes_merged'] += 1
projects_active[namespace].add(change['project'])
if verbose >= 2:
print("Found merged change: %s" % change['_number'])
for revision in change['revisions'].values():
if after < from_gerrit_time(revision['created']) < before:
report['repo_namespaces'][namespace]['revisions_pushed'] += 1
projects_active[namespace].add(change['project'])
if namespace not in committers:
committers[namespace] = set()
committers[namespace].add(revision['uploader']['_account_id'])
if verbose >= 2:
print("Found change revision: %s,%s" % (
change['_number'], revision['_number']))
for message in change['messages']:
if after < from_gerrit_time(message['date']) < before:
if ('tag' in message and message['tag'].startswith(
'autogenerated:')):
report['repo_namespaces'][namespace][
'review_automated'] += 1
projects_active[namespace].add(change['project'])
if verbose >= 2:
print("Found automated comment: %s,%s,%s (%s)" % (
change['_number'],
message['_revision_number'],
message['id'],
message['date']))
elif not message['message'].startswith(
'Uploaded patch set'):
report['repo_namespaces'][namespace][
'reviewer_messages'] += 1
projects_active[namespace].add(change['project'])
if namespace not in reviewers:
reviewers[namespace] = set()
reviewers[namespace].add(message['author']['_account_id'])
if verbose >= 2:
print("Found reviewer comment: %s,%s,%s (%s)" % (
change['_number'],
message['_revision_number'],
message['id'],
message['date']))
all_committers = set()
for namespace in committers:
report['repo_namespaces'][namespace]['committers'] = len(
committers[namespace])
all_committers = all_committers.union(committers[namespace])
all_reviewers = set()
for namespace in reviewers:
report['repo_namespaces'][namespace]['reviewers'] = len(
reviewers[namespace])
all_reviewers = all_reviewers.union(reviewers[namespace])
for namespace in projects_active:
report['repo_namespaces'][namespace]['projects_active'] = len(
projects_active[namespace])
ml_activity = {'_total': {}}
for scalar_month in range(
after.year * 12 + after.month,
before.year * 12 + before.month):
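        # Decode the linear month count back into a (year, month) tuple;
        # months are counted as year*12+month, so e.g. 2024*12+1 is
        # January 2024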
yearmonth = ((scalar_month - 1) // 12, scalar_month % 12 or 12)
add_all_ml_activity(
ml_activity, get_ml_index(), yearmonth, verbose=verbose)
report['ml_sites'] = {}
for site in ml_activity:
report['ml_sites'][site] = {'posts': 0, 'senders': 0}
for posts in ml_activity[site].values():
report['ml_sites'][site]['posts'] += posts
report['ml_sites'][site]['senders'] += 1
chat_activity = {'_all_channels': {}}
channels = get_channels_list(verbose=verbose)
for channel in channels:
namespace = channel.split('-')[0].strip('#')
for scalar_month in range(
after.year * 12 + after.month,
before.year * 12 + before.month):
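            # Decode the linear month count into (year, month) as above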
yearmonth = ((scalar_month - 1) // 12, scalar_month % 12 or 12)
            add_chat_activity(
                chat_activity,
                get_channel_logs(channel, yearmonth, verbose=verbose),
                namespace, verbose=verbose)
for namespace in chat_activity:
report['chat_namespaces'][namespace] = {
'messages': sum(chat_activity[namespace].values()),
'nicks': len(chat_activity[namespace]),
}
report['totals'] = dict()
report['totals']['active_repo_namespaces'] = len(report['repo_namespaces'])
report['totals']['committers'] = len(all_committers)
report['totals']['reviewers'] = len(all_reviewers)
additive_keys = (
'changes_created',
'changes_merged',
'projects_active',
'review_automated',
'reviewer_messages',
'revisions_pushed',
)
for key in additive_keys:
report['totals'][key] = 0
# Operate on a copy of the keys since we'll be altering the dict
for namespace in list(report['repo_namespaces'].keys()):
# Cull inactive namespaces from the report
if not report['repo_namespaces'][namespace]['projects_active']:
del report['repo_namespaces'][namespace]
continue
        # Add this namespace's counts into the report totals
for key in additive_keys:
report['totals'][key] += report['repo_namespaces'][namespace][key]
report['totals']['mailing_list_posts'] = (
report['ml_sites']['_total']['posts'])
report['totals']['mailing_list_senders'] = (
report['ml_sites']['_total']['senders'])
del report['ml_sites']['_total']
report['totals']['mailing_list_sites'] = len(report['ml_sites'])
report['totals']['chat_messages_logged'] = sum(
chat_activity['_all_channels'].values())
report['totals']['chat_nicknames_logged'] = len(
chat_activity['_all_channels'])
del report['chat_namespaces']['_all_channels']
report['totals']['chat_channel_namespaces'] = len(
report['chat_namespaces'])
# Write a recording if requested
    if mode == 'record':
        os.makedirs(os.path.dirname(recordfn), exist_ok=True)
        with open(recordfn, 'w') as record_file:
            record_file.write(yaml.dump(recording))
# Write the full YAML structured data report
    os.makedirs('reports', exist_ok=True)
    with open('reports/%s.yaml' % argument, 'w') as report_file:
        report_file.write(yaml.dump(report))
# Write the one-dimensional CSV tabular reports
    for tabname in ('times', 'totals'):
        table = [[argument, tabname]]
        for rowname in report[tabname]:
            table.append([rowname, report[tabname][rowname]])
        with open('reports/%s_%s.csv' % (argument, tabname), 'w',
                  newline='') as csv_file:
            csv.writer(csv_file).writerows(table)
# Write the two-dimensional CSV tabular reports
    for tabname in ('chat_namespaces', 'ml_sites', 'repo_namespaces'):
        table = [[argument]]
        for colname in report[tabname]:
            table[0].append(colname)
            for rowname in report[tabname][colname]:
                for row in table[1:]:
                    if row[0] == rowname:
                        break
                else:
                    row = [rowname]
                    table.append(row)
                # Pad with empty cells so each value lands under its own
                # column, even when an earlier column lacked this row
                row += [''] * (len(table[0]) - 1 - len(row))
                row.append(report[tabname][colname][rowname])
        # Pad out any rows missing values for the final columns
        for row in table[1:]:
            row += [''] * (len(table[0]) - len(row))
        with open('reports/%s_%s.csv' % (argument, tabname), 'w',
                  newline='') as csv_file:
            csv.writer(csv_file).writerows(table)
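

# Support direct execution; when installed as a package this utility is
# presumably invoked through a console-script entry point calling main()
if __name__ == '__main__':
    main()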