stackalytics/stackalytics/processor/record_processor.py

287 lines
10 KiB
Python

# Copyright (c) 2013 Mirantis Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import bisect
import re
from stackalytics.openstack.common import log as logging
from stackalytics.processor import normalizer
from stackalytics.processor import utils
LOG = logging.getLogger(__name__)
class RecordProcessor(object):
def __init__(self, runtime_storage_inst):
self.runtime_storage_inst = runtime_storage_inst
self.domains_index = runtime_storage_inst.get_by_key('companies')
self.users_index = runtime_storage_inst.get_by_key('users')
self.releases = runtime_storage_inst.get_by_key('releases')
self.releases_dates = [r['end_date'] for r in self.releases]
self.updated_users = set()
def _get_release(self, timestamp):
release_index = bisect.bisect(self.releases_dates, timestamp)
return self.releases[release_index]['release_name']
def _find_company(self, companies, date):
for r in companies:
if date < r['end_date']:
return r['company_name']
return companies[-1]['company_name']
def _get_company_by_email(self, email):
name, at, domain = email.partition('@')
if domain:
parts = domain.split('.')
for i in range(len(parts), 1, -1):
m = '.'.join(parts[len(parts) - i:])
if m in self.domains_index:
return self.domains_index[m]
return None
def _create_user(self, launchpad_id, email, user_name):
company = (self._get_company_by_email(email) or
self._get_independent())
user = {
'user_id': normalizer.get_user_id(launchpad_id, email),
'launchpad_id': launchpad_id,
'user_name': user_name,
'emails': [email],
'companies': [{
'company_name': company,
'end_date': 0,
}],
}
normalizer.normalize_user(user)
LOG.debug('Create new user: %s', user)
return user
def _get_lp_info(self, email):
lp_profile = None
if not re.match(r'[\w\d_\.-]+@([\w\d_\.-]+\.)+[\w]+', email):
LOG.debug('User email is not valid %s' % email)
else:
LOG.debug('Lookup user email %s at Launchpad' % email)
uri = ('https://api.launchpad.net/1.0/people/?'
'ws.op=getByEmail&email=%s' % email)
lp_profile = utils.read_json_from_uri(uri)
if not lp_profile:
LOG.debug('User with email %s not found', email)
return None, None
LOG.debug('Email is mapped to launchpad user: %s', lp_profile['name'])
return lp_profile['name'], lp_profile['display_name']
def _get_independent(self):
return self.domains_index['']
def _update_user(self, user, email):
LOG.debug('Add email %s to user %s', email, user['user_id'])
user['emails'].append(email)
company_name = self._get_company_by_email(email)
if ((company_name) and (len(user['companies']) == 1) and
(user['companies'][0]['company_name'] != company_name)):
LOG.debug('Updating affiliation of user %s to %s',
user['user_id'], company_name)
user['companies'][0]['company_name'] = company_name
self.updated_users.add(user['user_id'])
def _update_record_and_user(self, record):
email = record['author_email'].lower()
record['author_email'] = email
if email in self.users_index:
user = self.users_index[email]
record['launchpad_id'] = user['launchpad_id']
else:
if ('launchpad_id' in record) and (record['launchpad_id']):
launchpad_id = record['launchpad_id']
user_name = record['author_name']
else:
launchpad_id, user_name = self._get_lp_info(email)
record['launchpad_id'] = launchpad_id
if (launchpad_id) and (launchpad_id in self.users_index):
# merge emails
user = self.users_index[launchpad_id]
self._update_user(user, email)
else:
# create new
if not user_name:
user_name = record['author_name']
user = self._create_user(launchpad_id, email, user_name)
utils.store_user(self.runtime_storage_inst, user)
self.users_index[email] = user
if user['launchpad_id']:
self.users_index[user['launchpad_id']] = user
record['user_id'] = user['user_id']
company_by_user = self._find_company(user['companies'], record['date'])
if company_by_user == '*robots':
# don't map robots by email
company = company_by_user
else:
company = self._get_company_by_email(email)
if not company:
company = company_by_user
record['company_name'] = company
if ('user_name' in user) and (user['user_name']):
record['author_name'] = user['user_name']
def _process_commit(self, record):
record['primary_key'] = record['commit_id']
record['loc'] = record['lines_added'] + record['lines_deleted']
self._update_record_and_user(record)
if record['company_name'] != '*robots':
yield record
def _spawn_review(self, record):
# copy everything except pathsets and flatten user data
review = dict([(k, v) for k, v in record.iteritems()
if k not in ['patchSets', 'owner', 'createdOn']])
owner = record['owner']
if 'email' not in owner or 'username' not in owner:
return # ignore
review['primary_key'] = review['id']
review['launchpad_id'] = owner['username']
review['author_name'] = owner['name']
review['author_email'] = owner['email']
review['date'] = record['createdOn']
self._update_record_and_user(review)
yield review
def _spawn_marks(self, record):
review_id = record['id']
module = record['module']
for patch in record['patchSets']:
if 'approvals' not in patch:
continue # not reviewed by anyone
for approval in patch['approvals']:
# copy everything and flatten user data
mark = dict([(k, v) for k, v in approval.iteritems()
if k not in ['by', 'grantedOn']])
reviewer = approval['by']
if 'email' not in reviewer or 'username' not in reviewer:
continue # ignore
mark['record_type'] = 'mark'
mark['date'] = approval['grantedOn']
mark['primary_key'] = (record['id'] +
str(mark['date']) +
mark['type'])
mark['launchpad_id'] = reviewer['username']
mark['author_name'] = reviewer['name']
mark['author_email'] = reviewer['email']
mark['module'] = module
mark['review_id'] = review_id
self._update_record_and_user(mark)
yield mark
def _process_review(self, record):
"""
Process a review. Review spawns into records of two types:
* review - records that a user created review request
* mark - records that a user set approval mark to given review
"""
for gen in [self._spawn_review, self._spawn_marks]:
for r in gen(record):
yield r
def _apply_type_based_processing(self, record):
if record['record_type'] == 'commit':
for r in self._process_commit(record):
yield r
elif record['record_type'] == 'review':
for r in self._process_review(record):
yield r
def process(self, record_iterator):
for record in record_iterator:
for r in self._apply_type_based_processing(record):
if r['company_name'] == '*robots':
continue
r['week'] = utils.timestamp_to_week(r['date'])
if ('release' not in r) or (not r['release']):
r['release'] = self._get_release(r['date'])
yield r
self.runtime_storage_inst.set_by_key('users', self.users_index)
def update(self, record_iterator, release_index):
for record in record_iterator:
need_update = False
company_name = record['company_name']
user_id = record['user_id']
author_name = record['author_name']
self._update_record_and_user(record)
if ((record['company_name'] != company_name) or
(record['user_id'] != user_id) or
(record['author_name'] != author_name)):
need_update = True
if record['primary_key'] in release_index:
release = release_index[record['primary_key']]
else:
release = self._get_release(record['date'])
if record['release'] != release:
need_update = True
record['release'] = release
if need_update:
yield record
self.runtime_storage_inst.set_by_key('users', self.users_index)
def _get_records_for_users_to_update(self):
for record in self.runtime_storage_inst.get_all_records():
user_id = record['user_id']
if user_id in self.updated_users:
user = self.users_index[user_id]
user_company_name = user['companies'][0]['company_name']
if record['company_name'] != user_company_name:
LOG.debug('Record company will be changed to: %s',
user_company_name)
record['company_name'] = user_company_name
yield record
def finalize(self):
self.runtime_storage_inst.set_records(
self._get_records_for_users_to_update())