engagement/engagement/stats.py

# Copyright OpenDev Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS
# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import csv
import datetime
import html.parser
import json
import os
import re
import sys
import urllib.parse

import requests
import yaml


def requester(
        url, params={}, headers={}, mode='live', recording=None, verbose=0):
    """A requests wrapper to consistently retry HTTPS queries"""

    # We key recordings of queries based on a tuple of their URL and parameters
    # (this may not be stable in Python<3.6 due to lack of dict ordering)
    if mode == 'replay':
        # In replay mode, use recorded results for all queries
        text = recording[(url, params)]
    else:
        # In live or record modes, actually use the remote API instead
        retry = requests.Session()
        retry.mount("https://", requests.adapters.HTTPAdapter(max_retries=3))
        response = retry.get(url=url, params=params, headers=headers)
        text = response.text
        if verbose >= 2:
            print("Queried: %s" % response.url)
        if mode == 'record':
            # In record mode, also save a copy of the query results to replay
            recording[(url, params)] = text

    return text


def decode_json(raw):
    """Trap JSON decoding failures and provide more detailed errors"""

    # Gerrit's REST API prepends a JSON-breaker to avoid XSS vulnerabilities
    if raw.startswith(")]}'"):
        trimmed = raw[4:]
    else:
        trimmed = raw

    # Try to decode and bail with much detail if it fails
    try:
        decoded = json.loads(trimmed)
    except Exception:
        print('\nrequest returned %s error to query:\n\n    %s\n'
              '\nwith detail:\n\n    %s\n' % (raw, raw.url, trimmed),
              file=sys.stderr)
        raise
    return decoded


def query_gerrit(method, params={}, mode='live', recording=None, verbose=0):
    """Query the Gerrit REST API and make or replay a recording"""

    url = 'https://review.opendev.org/%s' % method
    result = requester(
        url,
        params=params,
        headers={'Accept': 'application/json'},
        mode=mode,
        recording=recording,
        verbose=verbose)
    return decode_json(result)


def from_gerrit_time(when):
    """Translate a Gerrit date/time string into a naive datetime object."""

    return datetime.datetime.strptime(when.split('.')[0], '%Y-%m-%d %H:%M:%S')


def to_gerrit_time(when):
    """Translate a datetime object into a Gerrit date/time string."""

    return when.strftime('%Y-%m-%d %H:%M:%S')


def get_projects(recording=None, verbose=0):
    """Return a sorted list of all namespaced code projects in Gerrit"""

    all_projects = query_gerrit(
        'projects/', params={'type': 'code'}, recording=recording,
        verbose=verbose)
    projects = list()
    for (project, details) in all_projects.items():
        if '/' in project:
            projects.append(project)
    return sorted(projects)


def usage_error():
    """Write a generic usage message to stderr and exit nonzero"""

    sys.stderr.write(
        'ERROR: specify report period like YEAR, YEAR-H[1-2], YEAR-Q[1-4],\n'
        '       or YEAR-[01-12], optionally prefixed by record- or replay-\n'
        '       if you want to make a recording or reuse a prior recording\n')
    sys.exit(1)


def parse_report_period(when):
    """Parse a supplied report period string, returning a tuple of
    after and before datetime objects"""

    monthly = re.compile(r'^(\d{4})-(\d{2})$')
    quarterly = re.compile(r'^(\d{4})-q([1-4])$', re.IGNORECASE)
    halfyearly = re.compile(r'^(\d{4})-h([1-4])$', re.IGNORECASE)
    yearly = re.compile(r'^\d{4}$')
    if monthly.match(when):
        start_year = int(monthly.match(when).group(1))
        start_month = int(monthly.match(when).group(2))
        end_year = start_year + start_month // 12
        end_month = 1 + start_month % 12
    elif quarterly.match(when):
        start_year = int(quarterly.match(when).group(1))
        start_month = 1 + 3 * (int(quarterly.match(when).group(2)) - 1)
        end_year = start_year + (start_month + 2) // 12
        end_month = 1 + (start_month + 2) % 12
    elif halfyearly.match(when):
        start_year = int(halfyearly.match(when).group(1))
        start_month = 1 + 6 * (int(halfyearly.match(when).group(2)) - 1)
        end_year = start_year + (start_month + 5) // 12
        end_month = 1 + (start_month + 5) % 12
    elif yearly.match(when):
        start_year = int(yearly.match(when).group())
        start_month = 1
        end_year = start_year + 1
        end_month = 1
    else:
        usage_error()
    after = datetime.datetime(start_year, start_month, 1)
    before = datetime.datetime(end_year, end_month, 1)
    return after, before


def parse_command_line():
    """Parse the command line to obtain the report period, then return it"""

    if len(sys.argv) == 2:
        return sys.argv[1]
    else:
        usage_error()


def report_times(report, after, before):
    """Add timestamp values to provided report"""

    report['times'] = dict()
    report['times']['after'] = to_gerrit_time(after)
    report['times']['before'] = to_gerrit_time(before)
    report['times']['generated'] = to_gerrit_time(datetime.datetime.utcnow())
    return report


def get_ml_index(verbose=0):
    sites = yaml.safe_load(
        requester('http://lists.opendev.org/archives.yaml', verbose=verbose))
    return sites


def get_ml_archive(listname, site, yearmonth, verbose=0):
    year, month = yearmonth
    monthname = datetime.date(1, month, 1).strftime('%B')
    return requester('http://%s/pipermail/%s/%s-%s.txt' % (
        site,
        listname,
        year,
        monthname,
        ), verbose=verbose)


def add_ml_activity(ml_activity, site, archive):
    if archive:
        for line in archive.split('\n'):
            # Take care to avoid incorrectly matching on lines which
            # begin with the word From inside the message body
            fromline = re.match(
                    r'From ([^ ]+) at ([0-9A-Za-z\.-]+\.[0-9A-Za-z\.-]+)  '
                    r'(Sun|Mon|Tue|Wed|Thu|Fri|Sat) '
                    r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) '
                    r'[ 123][0-9] [012][0-9]:[0-9]{2}:[0-9]{2} [0-9]{4}$',
                    line)
            if fromline:
                localpart, domainpart = fromline.groups()[:2]
                domainpart = domainpart.lower()
                address = '%s@%s' % (localpart, domainpart)
                if address.lower() in (
                        'build.starlingx@gmail.com',
                        'hudson@openstack.org',
                        'info@bitergia.com',
                        'infra-root@openstack.org',
                        'jenkins@openstack.org',
                        'no-reply@openstack.org',
                        'readthedocs@readthedocs.org',
                        'review@openstack.org',
                        'zuul@opendev.org',
                        'zuul@openstack.org',
                ) or domainpart in (
                        'bugs.launchpad.net',
                        'lists.airshipit.org',
                        'lists.katacontainers.io',
                        'lists.opendev.org',
                        'lists.openinfra.dev',
                        'lists.openstack.org',
                        'lists.starlingx.io',
                        'lists.zuul-ci.org',
                        'review.opendev.org',
                        'review.openstack.org',
                        'storyboard.openstack.org',
                        'storyboard.opendev.org',
                        'zuul.opendev.org',
                        'zuul.openstack.org',
                ):
                    continue
                if address in ml_activity[site]:
                    ml_activity[site][address] += 1
                else:
                    ml_activity[site][address] = 1
                if address in ml_activity['_total']:
                    ml_activity['_total'][address] += 1
                else:
                    ml_activity['_total'][address] = 1


def add_all_ml_activity(ml_activity, sites, yearmonth, verbose=0):
    for site in sites:
        if site not in ml_activity:
            ml_activity[site] = {}
        for listname in sites[site]:
            archive = get_ml_archive(
                listname, site, yearmonth, verbose=verbose)
            add_ml_activity(ml_activity, site, archive)


class ChannelsListParser(html.parser.HTMLParser):
    def __init__(self):
        self.channels = list()
        super().__init__()

    def handle_starttag(self, tag, attrs):
        if tag == 'a' and attrs[0][1].startswith('%23'):
            self.channels.append(urllib.parse.unquote(attrs[0][1].strip('/')))


def get_channels_list(verbose=0):
    parser = ChannelsListParser()
    parser.feed(
        requester('https://meetings.opendev.org/irclogs/', verbose=verbose))
    return parser.channels


class LogsListParser(html.parser.HTMLParser):
    def __init__(self):
        self.logs = list()
        super().__init__()

    def handle_starttag(self, tag, attrs):
        if tag == 'a' and attrs[0][1].startswith('%23'):
            self.logs.append(attrs[0][1])


def get_channel_logs(channel, yearmonth, verbose=0):
    year, month = yearmonth
    channel = urllib.parse.quote(channel)
    logs = ''
    parser = LogsListParser()
    parser.feed(requester(
        'https://meetings.opendev.org/irclogs/%s/' % channel, verbose=verbose))
    for day in range(1, 32):
        if '%s.%d-%02d-%02d.log.html' % (
                channel, year, month, day) not in parser.logs:
            continue
        result = requester(
            'https://meetings.opendev.org/irclogs/%s/%s.%d-%02d-%02d.log' % (
                channel,
                channel,
                year,
                month,
                day,
                ), verbose=verbose)
        if result:
            logs += result
    return logs.split('\n')


def add_chat_activity(chat_activity, logs, namespace, verbose=0):
    if logs:
        for line in logs.split('\n'):
            logline = re.match(r'[0-9T:-]{19}  <([^ ]+)> ', line)
            if logline:
                nick = logline.group(1).strip('@')
                if nick in (
                        'opendevmeet',
                        'opendevreview',
                        'opendevstatus',
                        'openstack',
                        'openstackgerrit',
                        'openstackstatus',
                        ):
                    continue
                if namespace not in chat_activity:
                    chat_activity[namespace] = {}
                    if verbose >= 1:
                        print("Adding namespace: %s" % namespace)
                if nick in chat_activity['_all_channels']:
                    chat_activity['_all_channels'][nick] += 1
                else:
                    chat_activity['_all_channels'][nick] = 1
                    if verbose >= 1:
                        print("Found chat nick: %s" % nick)
                if nick in chat_activity[namespace]:
                    chat_activity[namespace][nick] += 1
                else:
                    chat_activity[namespace][nick] = 1


def main(verbose=0):
    """Utility entry point"""

    argument = parse_command_line()
    if argument.startswith('record-'):
        mode = 'record'
        argument = argument[len(mode)+1:]
    elif argument.startswith('replay-'):
        mode = 'replay'
        argument = argument[len(mode)+1:]
    else:
        mode = 'live'

    recordfn = 'recordings/%s.yaml' % argument
    if mode == 'record':
        recording = {}
    elif mode == 'replay':
        recording = yaml.load(open(recordfn), loader=yaml.loader.safeLoader)
    else:
        recording = None

    after, before = parse_report_period(argument)
    changes = dict()

    # Shard querying by project, to help with the inherent instability of
    # result pagination from the Gerrit API
    for project in get_projects(recording=recording, verbose=verbose):
        if verbose >= 1:
            print("Checking project: %s" % project)
        offset = 0
        # Loop due to unavoidable query result pagination
        while offset >= 0:
            # We only constrain the query by the after date, as changes created
            # between the before and after date may have been updated more
            # recently with a new revision or comment
            new_changes = query_gerrit('changes/', params={
                'q': 'project:%s after:{%s}' % (
                    project, to_gerrit_time(after)),
                'no-limit': '1',
                'start': offset,
                'o': ['ALL_REVISIONS', 'MESSAGES', 'SKIP_DIFFSTAT'],
                }, recording=recording, verbose=verbose)
            # Since we redundantly query ranges with offsets to help combat
            # pagination instability, we must deduplicate results
            for change in new_changes:
                if change['id'] not in changes:
                    changes[change['id']] = change
            # Offset additional pages by half the returned entry count to help
            # avoid missing changes due to pagination instability
            if new_changes and new_changes[-1].get('_more_changes', False):
                offset += len(new_changes)/2
            else:
                offset = -1

    report = {
        'chat_namespaces': dict(),
        'ml_sites': dict(),
        'repo_namespaces': dict(),
        }
    report_times(report, after, before)
    committers = dict()
    projects_active = dict()
    reviewers = dict()
    for change in changes.values():
        namespace = change['project'].split("/")[0]
        if namespace not in report['repo_namespaces']:
            report['repo_namespaces'][namespace] = {
                'changes_created': 0,
                'changes_merged': 0,
                'review_automated': 0,
                'reviewer_messages': 0,
                'revisions_pushed': 0,
                }
        if namespace not in projects_active:
            projects_active[namespace] = set()
        if after < from_gerrit_time(change['created']) < before:
            # Note that the changes are not returned in chronological
            # order, so we have to test all of them and can't short-circuit
            # after the first change which was created too late
            report['repo_namespaces'][namespace]['changes_created'] += 1
            projects_active[namespace].add(change['project'])
            if namespace not in committers:
                committers[namespace] = set()
            committers[namespace].add(change['owner']['_account_id'])
            if verbose >= 2:
                print("Found created change: %s" % change['_number'])
        if ('submitted' in change and after < from_gerrit_time(
                change['submitted']) < before):
            report['repo_namespaces'][namespace]['changes_merged'] += 1
            projects_active[namespace].add(change['project'])
            if verbose >= 2:
                print("Found merged change: %s" % change['_number'])
        for revision in change['revisions'].values():
            if after < from_gerrit_time(revision['created']) < before:
                report['repo_namespaces'][namespace]['revisions_pushed'] += 1
                projects_active[namespace].add(change['project'])
                if namespace not in committers:
                    committers[namespace] = set()
                committers[namespace].add(revision['uploader']['_account_id'])
                if verbose >= 2:
                    print("Found change revision: %s,%s" % (
                        change['_number'], revision['_number']))
        for message in change['messages']:
            if after < from_gerrit_time(message['date']) < before:
                if ('tag' in message and message['tag'].startswith(
                        'autogenerated:')):
                    report['repo_namespaces'][namespace][
                        'review_automated'] += 1
                    projects_active[namespace].add(change['project'])
                    if verbose >= 2:
                        print("Found automated comment: %s,%s,%s (%s)" % (
                            change['_number'],
                            message['_revision_number'],
                            message['id'],
                            message['date']))
                elif not message['message'].startswith(
                        'Uploaded patch set'):
                    report['repo_namespaces'][namespace][
                        'reviewer_messages'] += 1
                    projects_active[namespace].add(change['project'])
                    if namespace not in reviewers:
                        reviewers[namespace] = set()
                    reviewers[namespace].add(message['author']['_account_id'])
                    if verbose >= 2:
                        print("Found reviewer comment: %s,%s,%s (%s)" % (
                            change['_number'],
                            message['_revision_number'],
                            message['id'],
                            message['date']))
    all_committers = set()
    for namespace in committers:
        report['repo_namespaces'][namespace]['committers'] = len(
            committers[namespace])
        all_committers = all_committers.union(committers[namespace])
    all_reviewers = set()
    for namespace in reviewers:
        report['repo_namespaces'][namespace]['reviewers'] = len(
            reviewers[namespace])
        all_reviewers = all_reviewers.union(reviewers[namespace])
    for namespace in projects_active:
        report['repo_namespaces'][namespace]['projects_active'] = len(
            projects_active[namespace])

    ml_activity = {'_total': {}}
    for scalar_month in range(
            after.year * 12 + after.month,
            before.year * 12 + before.month):
        yearmonth = ((scalar_month - 1) // 12, scalar_month % 12 or 12)
        add_all_ml_activity(
            ml_activity, get_ml_index(), yearmonth, verbose=verbose)
    report['ml_sites'] = {}
    for site in ml_activity:
        report['ml_sites'][site] = {'posts': 0, 'senders': 0}
        for posts in ml_activity[site].values():
            report['ml_sites'][site]['posts'] += posts
            report['ml_sites'][site]['senders'] += 1

    chat_activity = {'_all_channels': {}}
    channels = get_channels_list(verbose=verbose)
    for channel in channels:
        namespace = channel.split('-')[0].strip('#')
        for scalar_month in range(
                after.year * 12 + after.month,
                before.year * 12 + before.month):
            yearmonth = ((scalar_month - 1) // 12, scalar_month % 12 or 12)
            for logs in get_channel_logs(channel, yearmonth, verbose=verbose):
                add_chat_activity(
                    chat_activity, logs, namespace, verbose=verbose)
    for namespace in chat_activity:
        report['chat_namespaces'][namespace] = {
            'messages': sum(chat_activity[namespace].values()),
            'nicks': len(chat_activity[namespace]),
            }

    report['totals'] = dict()
    report['totals']['active_repo_namespaces'] = len(report['repo_namespaces'])
    report['totals']['committers'] = len(all_committers)
    report['totals']['reviewers'] = len(all_reviewers)
    additive_keys = (
        'changes_created',
        'changes_merged',
        'projects_active',
        'review_automated',
        'reviewer_messages',
        'revisions_pushed',
        )
    for key in additive_keys:
        report['totals'][key] = 0
    # Operate on a copy of the keys since we'll be altering the dict
    for namespace in list(report['repo_namespaces'].keys()):
        # Cull inactive namespaces from the report
        if not report['repo_namespaces'][namespace]['projects_active']:
            del report['repo_namespaces'][namespace]
            continue
        # Summation key totals
        for key in additive_keys:
            report['totals'][key] += report['repo_namespaces'][namespace][key]

    report['totals']['mailing_list_posts'] = (
        report['ml_sites']['_total']['posts'])
    report['totals']['mailing_list_senders'] = (
        report['ml_sites']['_total']['senders'])
    del report['ml_sites']['_total']
    report['totals']['mailing_list_sites'] = len(report['ml_sites'])

    report['totals']['chat_messages_logged'] = sum(
        chat_activity['_all_channels'].values())
    report['totals']['chat_nicknames_logged'] = len(
        chat_activity['_all_channels'])
    del report['chat_namespaces']['_all_channels']
    report['totals']['chat_channel_namespaces'] = len(
        report['chat_namespaces'])

    # Write a recording if requested
    if mode == 'record':
        os.makedirs(os.path.dirname(recordfn), exist_ok=True)
        open(recordfn, 'w').write(yaml.dump(recording))

    # Write the full YAML structured data report
    os.makedirs('reports', exist_ok=True)
    open('reports/%s.yaml' % argument, 'w').write(yaml.dump(report))

    # Write the one-dimensional CSV tabular reports
    for tabname in ('times', 'totals'):
        table = [[argument, tabname]]
        for rowname in report[tabname]:
            table.append([rowname, report[tabname][rowname]])
        csv.writer(open('reports/%s_%s.csv' % (
            argument, tabname), 'w')).writerows(table)

    # Write the two-dimensional CSV tabular reports
    for tabname in ('chat_namespaces', 'ml_sites', 'repo_namespaces'):
        table = [[argument]]
        for colname in report[tabname]:
            table[0].append(colname)
            for rowname in report[tabname][colname]:
                row_updated = False
                for row in table[1:]:
                    if row[0] == rowname:
                        row.append(report[tabname][colname][rowname])
                        row_updated = True
                        break
                if not row_updated:
                    table.append(
                        [rowname, report[tabname][colname][rowname]])
        csv.writer(open('reports/%s_%s.csv' % (
            argument, tabname), 'w')).writerows(table)