Revert "Remove Foundation members report"
The Foundation members report was useful for getting data on new registrations.
Unfortunately, the user registration date can only be retrieved from HTML
and is not available in the OpenStackID-Resources API.
This reverts commit fd2ba43994.
Change-Id: I8d86cec906f516be5696c679176ba4919f18edc7
This commit is contained in:
@@ -32,6 +32,8 @@ PROCESSOR_OPTS = [
|
||||
'default_data_uri = file:///path/to/default_data.json'),
|
||||
cfg.StrOpt('sources-root', default='/var/local/stackalytics',
|
||||
help='The folder that holds all project sources to analyze'),
|
||||
cfg.IntOpt('days_to_update_members', default=30,
|
||||
help='Number of days to update members'),
|
||||
cfg.StrOpt('corrections-uri',
|
||||
default=('https://git.openstack.org/cgit/'
|
||||
'openstack/stackalytics/plain/etc/corrections.json'),
|
||||
@@ -51,6 +53,8 @@ PROCESSOR_OPTS = [
|
||||
cfg.StrOpt("fetching-user-source", default='launchpad',
|
||||
choices=['launchpad', '<None>'],
|
||||
help="Source for fetching user profiles"),
|
||||
cfg.IntOpt('members-look-ahead', default=250,
|
||||
help='How many member profiles to look ahead after the last'),
|
||||
cfg.IntOpt('read-timeout', default=120,
|
||||
help='Number of seconds to wait for remote response'),
|
||||
cfg.IntOpt('gerrit-retry', default=10,
|
||||
|
||||
@@ -43,7 +43,8 @@ OPTS = [
|
||||
|
||||
|
||||
SINGLE_KEYS = ['module_groups', 'project_types', 'repos', 'releases',
|
||||
'companies', 'runtime_storage_update_time']
|
||||
'companies', 'last_update_members_date', 'last_member_index',
|
||||
'runtime_storage_update_time']
|
||||
ARRAY_KEYS = ['record', 'user']
|
||||
BULK_READ_SIZE = 64
|
||||
MEMCACHED_URI_PREFIX = r'^memcached:\/\/'
|
||||
@@ -139,6 +140,8 @@ def export_data(memcached_inst, fd):
|
||||
pickle.dump(('user:%s' % user['launchpad_id'], user), fd)
|
||||
if user.get('gerrit_id'):
|
||||
pickle.dump(('user:gerrit:%s' % user['gerrit_id'], user), fd)
|
||||
if user.get('member_id'):
|
||||
pickle.dump(('user:member:%s' % user['member_id'], user), fd)
|
||||
for email in user.get('emails') or []:
|
||||
pickle.dump((('user:%s' % email).encode('utf8'), user), fd)
|
||||
|
||||
|
||||
@@ -27,6 +27,7 @@ from stackalytics.processor import default_data_processor
|
||||
from stackalytics.processor import governance
|
||||
from stackalytics.processor import lp
|
||||
from stackalytics.processor import mls
|
||||
from stackalytics.processor import mps
|
||||
from stackalytics.processor import rcs
|
||||
from stackalytics.processor import record_processor
|
||||
from stackalytics.processor import runtime_storage
|
||||
@@ -192,6 +193,23 @@ def _process_translation_stats(runtime_storage_inst, record_processor_inst):
|
||||
runtime_storage_inst.set_records(processed_translation_iterator)
|
||||
|
||||
|
||||
def _process_member_list(uri, runtime_storage_inst, record_processor_inst):
    """Fetch, type, process and store the member records found at ``uri``.

    The raw member iterator produced by ``mps.log`` is tagged with the
    'member' record type, run through the record processor and the result
    is handed to the runtime storage.
    """
    raw_members = mps.log(uri, runtime_storage_inst,
                          CONF.days_to_update_members,
                          CONF.members_look_ahead)
    typed_members = _record_typer(raw_members, 'member')
    runtime_storage_inst.set_records(
        record_processor_inst.process(typed_members))
|
||||
|
||||
|
||||
def update_members(runtime_storage_inst, record_processor_inst):
    """Process every member list URI stored under the 'member_lists' key.

    Nothing is done when the key is missing or holds an empty value.
    """
    configured_lists = runtime_storage_inst.get_by_key('member_lists') or []
    for list_uri in configured_lists:
        _process_member_list(list_uri, runtime_storage_inst,
                             record_processor_inst)
|
||||
|
||||
|
||||
def _post_process_records(record_processor_inst, repos):
|
||||
LOG.debug('Build release index')
|
||||
release_index = {}
|
||||
@@ -308,6 +326,9 @@ def main():
|
||||
|
||||
apply_corrections(CONF.corrections_uri, runtime_storage_inst)
|
||||
|
||||
# long operation should be the last
|
||||
update_members(runtime_storage_inst, record_processor_inst)
|
||||
|
||||
runtime_storage_inst.set_by_key('runtime_storage_update_time',
|
||||
utils.date_to_timestamp('now'))
|
||||
LOG.info('stackalytics-processor succeeded.')
|
||||
|
||||
111
stackalytics/processor/mps.py
Normal file
111
stackalytics/processor/mps.py
Normal file
@@ -0,0 +1,111 @@
|
||||
# Copyright (c) 2013 Mirantis Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
# implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import random
|
||||
import re
|
||||
import time
|
||||
|
||||
from oslo_log import log as logging
|
||||
import requests
|
||||
import six
|
||||
|
||||
from stackalytics.processor import utils
|
||||
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
# Captures ``member_name`` as the text right after an ``<h3>`` tag and
# ``date_joined`` as the text right after the next
# ``<div class="span-N last">`` tag (N being a single digit).
NAME_AND_DATE_PATTERN = (r'<h3>(?P<member_name>[^<]*)[\s\S]*?'
                         r'<div class="span-\d last">(?P<date_joined>[^<]*)')
# Starting at ``<strong>Date Joined``, captures ``company_draft`` as the text
# right after the next ``<b>`` tag and ``date_from`` as the text between the
# following "From " and "(Current)".
COMPANY_PATTERN = (r'<strong>Date\sJoined[\s\S]*?<b>(?P<company_draft>[^<]*)'
                   r'[\s\S]*?From\s(?P<date_from>[\s\S]*?)\(Current\)')
|
||||
# Runs of these punctuation characters are removed from scraped text.
GARBAGE_PATTERN = r'[/\\~%^\*_]+'


def strip_garbage(s):
    """Return ``s`` without garbage characters and with whitespace collapsed.

    Every run of characters matching ``GARBAGE_PATTERN`` is deleted first,
    then every run of whitespace is replaced by a single space. Leading and
    trailing whitespace is collapsed but not removed.
    """
    without_garbage = re.sub(GARBAGE_PATTERN, '', s)
    return re.sub(r'\s+', ' ', without_garbage)
|
||||
|
||||
|
||||
def _retrieve_member(requests_session, uri, member_id, html_parser):
    """Download one member profile page and extract its fields.

    :param requests_session: session passed through to ``utils.read_uri``
    :param uri: address of the profile page
    :param member_id: identifier stored in the result as ``member_id``
    :param html_parser: object whose ``unescape`` method, when present, is
        used to decode HTML entities in the company name
    :returns: an empty dict when the page has no content; otherwise a dict
        that always carries ``company_draft`` ('*independent' when no
        company is found) and, only when the name/date pattern matches,
        also ``member_id``, ``member_name``, ``date_joined`` and
        ``member_uri``
    """
    content = utils.read_uri(uri, session=requests_session)

    if not content:
        return {}

    # HTMLParser.unescape() was removed in Python 3.9; fall back to the
    # module-level html.unescape() when the parser no longer provides it.
    unescape = getattr(html_parser, 'unescape', None)
    if unescape is None:
        from html import unescape

    member = {}

    # Only the first name/date occurrence on the page is used.
    name_match = re.search(NAME_AND_DATE_PATTERN, content)
    if name_match:
        member['member_id'] = member_id
        member['member_name'] = strip_garbage(name_match.group('member_name'))
        member['date_joined'] = name_match.group('date_joined')
        member['member_uri'] = uri

    member['company_draft'] = '*independent'
    # Every match overwrites the previous one, so the last match wins.
    for company_match in re.finditer(COMPANY_PATTERN, content):
        member['company_draft'] = strip_garbage(
            unescape(company_match.group('company_draft')))

    return member
|
||||
|
||||
|
||||
def log(uri, runtime_storage_inst, days_to_update_members, members_look_ahead):
    """Yield member profiles scraped from consecutively numbered pages.

    Profile pages are addressed as ``uri + str(index)``. Scanning starts
    right after the index stored under 'last_member_index', or from index 1
    when the stored 'last_update_members_date' is at least
    ``days_to_update_members`` days old (in which case that date is reset
    to the current time). Scanning stops once ``members_look_ahead``
    consecutive pages yield no member name.

    :param uri: prefix of the profile page address
    :param runtime_storage_inst: storage providing ``get_by_key`` and
        ``set_by_key``
    :param days_to_update_members: age in days after which a full rescan
        is started
    :param members_look_ahead: number of consecutive empty profiles that
        ends the scan
    :returns: generator of member dicts as built by ``_retrieve_member``

    'last_member_index' is written back only when the scan runs to
    completion.
    """
    LOG.debug('Retrieving new openstack.org members')

    last_update_members_date = runtime_storage_inst.get_by_key(
        'last_update_members_date') or 0
    last_member_index = runtime_storage_inst.get_by_key(
        'last_member_index') or 0

    end_update_date = int(time.time()) - days_to_update_members * 24 * 60 * 60

    if last_update_members_date <= end_update_date:
        # The previous full scan is too old: start over from the beginning.
        last_member_index = 0
        last_update_members_date = int(time.time())

        runtime_storage_inst.set_by_key('last_update_members_date',
                                        last_update_members_date)

    cnt_empty = 0
    cur_index = last_member_index + 1
    html_parser = six.moves.html_parser.HTMLParser()

    # The context manager closes the session even when retrieval raises or
    # the consumer stops iterating before the scan is finished.
    with requests.Session() as requests_session:
        while cnt_empty < members_look_ahead:

            profile_uri = uri + str(cur_index)
            member = _retrieve_member(requests_session, profile_uri,
                                      str(cur_index), html_parser)

            if 'member_name' not in member:
                cnt_empty += 1
                cur_index += 1
                continue

            cnt_empty = 0
            last_member_index = cur_index
            cur_index += 1
            LOG.debug('New member: %s', member['member_id'])
            yield member

            # Random pause of up to five seconds between found profiles.
            time.sleep(random.random() * 5)

    LOG.debug('Last_member_index: %s', last_member_index)
    runtime_storage_inst.set_by_key('last_member_index', last_member_index)
|
||||
@@ -425,6 +425,39 @@ class RecordProcessor(object):
|
||||
|
||||
yield bug_fixed
|
||||
|
||||
def _process_member(self, record):
    """Turn a raw member record into a processed record and yield it.

    Fills in the primary key, date, author and module fields, resolves the
    company name from the member's company draft, and stores a user
    profile whose only company entry is that resolved company.
    """
    member_user_id = user_processor.make_user_id(
        member_id=record['member_id'])
    record['primary_key'] = member_user_id
    record['date'] = utils.member_date_to_timestamp(record['date_joined'])
    record['author_name'] = record['member_name']
    record['module'] = 'unknown'

    draft = record['company_draft']
    # Prefer the company known to the domains index; otherwise fall back
    # to the normalized draft text.
    resolved_company = self.domains_index.get(
        utils.normalize_company_name(draft))
    if not resolved_company:
        resolved_company = utils.normalize_company_draft(draft)

    # author_email is a key to create new user
    record['author_email'] = member_user_id
    record['company_name'] = resolved_company
    # _update_record_and_user function will create new user if needed
    self._update_record_and_user(record)
    # Re-applied after the call above; presumably that call may change
    # company_name (its body is not visible here).
    record['company_name'] = resolved_company

    member_user = user_processor.load_user(self.runtime_storage_inst,
                                           user_id=member_user_id)
    member_user['user_name'] = record['author_name']
    member_user['companies'] = [{
        'company_name': resolved_company,
        'end_date': 0,
    }]
    member_user['company_name'] = resolved_company
    user_processor.store_user(self.runtime_storage_inst, member_user)

    record['company_name'] = resolved_company

    yield record
|
||||
|
||||
def _process_translation(self, record):
|
||||
# todo split translation and approval
|
||||
translation = record.copy()
|
||||
@@ -455,6 +488,7 @@ class RecordProcessor(object):
|
||||
'email': self._process_email,
|
||||
'bp': self._process_blueprint,
|
||||
'bug': self._process_bug,
|
||||
'member': self._process_member,
|
||||
'i18n': self._process_translation,
|
||||
}
|
||||
|
||||
@@ -681,6 +715,39 @@ class RecordProcessor(object):
|
||||
self.runtime_storage_inst.set_records(
|
||||
self._close_patch(cores, marks_patch['marks']))
|
||||
|
||||
def _update_members_company_name(self):
    """Yield a record handler that refreshes company names of members.

    This method is itself a generator: it yields a single callable,
    ``record_handler``, and does no record processing of its own.
    """
    LOG.info('Update members with company names')

    def record_handler(record):
        # Generator: yields the record only when its company name changed.
        if record['record_type'] != 'member':
            return

        company_draft = record['company_draft']
        # Prefer the company from the domains index; otherwise use the
        # normalized draft text.
        company_name = self.domains_index.get(
            utils.normalize_company_name(company_draft)) or (
                utils.normalize_company_draft(company_draft))

        if company_name == record['company_name']:
            return

        LOG.debug('Update record %s, company name changed to %s',
                  record, company_name)
        record['company_name'] = company_name

        yield record

        # Everything below runs only when the consumer resumes this
        # generator after receiving the record.
        user = user_processor.load_user(self.runtime_storage_inst,
                                        user_id=record['user_id'])
        LOG.debug('Update user %s, company name changed to %s',
                  user, company_name)
        user['companies'] = [{
            'company_name': company_name,
            'end_date': 0,
        }]
        user_processor.store_user(self.runtime_storage_inst, user)

    yield record_handler
|
||||
|
||||
def _update_commits_with_module_alias(self):
|
||||
LOG.info('Update record with aliases')
|
||||
|
||||
@@ -706,6 +773,7 @@ class RecordProcessor(object):
|
||||
self._update_commits_with_module_alias,
|
||||
self._update_blueprints_with_mention_info,
|
||||
self._determine_core_contributors,
|
||||
self._update_members_company_name,
|
||||
self._update_marks_with_disagreement,
|
||||
]
|
||||
|
||||
|
||||
@@ -225,6 +225,12 @@ default_data = {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"member_lists": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"project_types": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
|
||||
@@ -22,11 +22,13 @@ ROBOTS = '*robots'
|
||||
|
||||
|
||||
def make_user_id(emails=None, launchpad_id=None, gerrit_id=None,
|
||||
github_id=None, zanata_id=None):
|
||||
member_id=None, github_id=None, zanata_id=None):
|
||||
if launchpad_id or emails:
|
||||
return launchpad_id or emails[0]
|
||||
if gerrit_id:
|
||||
return 'gerrit:%s' % gerrit_id
|
||||
if member_id:
|
||||
return 'member:%s' % member_id
|
||||
if github_id:
|
||||
return 'github:%s' % github_id
|
||||
if zanata_id:
|
||||
@@ -58,11 +60,11 @@ def store_user(runtime_storage_inst, user):
|
||||
|
||||
|
||||
def load_user(runtime_storage_inst, seq=None, user_id=None, email=None,
|
||||
launchpad_id=None, gerrit_id=None, github_id=None,
|
||||
zanata_id=None):
|
||||
launchpad_id=None, gerrit_id=None, member_id=None,
|
||||
github_id=None, zanata_id=None):
|
||||
|
||||
key = make_user_id(gerrit_id=gerrit_id, github_id=github_id,
|
||||
zanata_id=zanata_id)
|
||||
key = make_user_id(gerrit_id=gerrit_id, member_id=member_id,
|
||||
github_id=github_id, zanata_id=zanata_id)
|
||||
if not key:
|
||||
key = seq or user_id or launchpad_id or email
|
||||
if key:
|
||||
|
||||
@@ -63,6 +63,13 @@ def date_to_timestamp_ext(d):
|
||||
return int(d)
|
||||
|
||||
|
||||
def member_date_to_timestamp(d):
    """Convert a member join date such as 'January 15, 2014' to a timestamp.

    :param d: date text in '%B %d, %Y' form; surrounding whitespace is
        ignored
    :returns: integer timestamp computed with ``time.mktime`` (the date is
        interpreted in local time), or 0 when ``d`` is empty, None or
        whitespace only
    :raises ValueError: when the text is not in the expected format
    """
    # Whitespace in a strptime format demands at least one whitespace
    # character in the input, so strip the input and parse without any
    # trailing space instead of requiring one.
    date_text = d.strip() if d else ''
    if not date_text:
        return 0
    return int(time.mktime(
        datetime.datetime.strptime(date_text, '%B %d, %Y').timetuple()))
|
||||
|
||||
|
||||
def iso8601_to_timestamp(s):
    """Parse ISO 8601 text ``s`` and return its UTC epoch timestamp."""
    parsed = iso8601.parse_date(s)
    return calendar.timegm(parsed.utctimetuple())
|
||||
|
||||
|
||||
Reference in New Issue
Block a user