Revert "Remove Foundation members report"

The Foundation members report was useful for getting data on new
registrations. Unfortunately, the user registration date can only be
retrieved from HTML and is not available in the OpenStackID-Resources API.

This reverts commit fd2ba43994.

Change-Id: I8d86cec906f516be5696c679176ba4919f18edc7
This commit is contained in:
Ilya Shakhat
2017-09-07 14:59:51 +00:00
parent fd2ba43994
commit 43ffa83fe4
24 changed files with 823 additions and 14 deletions

View File

@@ -32,6 +32,8 @@ PROCESSOR_OPTS = [
'default_data_uri = file:///path/to/default_data.json'),
cfg.StrOpt('sources-root', default='/var/local/stackalytics',
help='The folder that holds all project sources to analyze'),
cfg.IntOpt('days_to_update_members', default=30,
help='Number of days to update members'),
cfg.StrOpt('corrections-uri',
default=('https://git.openstack.org/cgit/'
'openstack/stackalytics/plain/etc/corrections.json'),
@@ -51,6 +53,8 @@ PROCESSOR_OPTS = [
cfg.StrOpt("fetching-user-source", default='launchpad',
choices=['launchpad', '<None>'],
help="Source for fetching user profiles"),
cfg.IntOpt('members-look-ahead', default=250,
help='How many member profiles to look ahead after the last'),
cfg.IntOpt('read-timeout', default=120,
help='Number of seconds to wait for remote response'),
cfg.IntOpt('gerrit-retry', default=10,

View File

@@ -43,7 +43,8 @@ OPTS = [
SINGLE_KEYS = ['module_groups', 'project_types', 'repos', 'releases',
'companies', 'runtime_storage_update_time']
'companies', 'last_update_members_date', 'last_member_index',
'runtime_storage_update_time']
ARRAY_KEYS = ['record', 'user']
BULK_READ_SIZE = 64
MEMCACHED_URI_PREFIX = r'^memcached:\/\/'
@@ -139,6 +140,8 @@ def export_data(memcached_inst, fd):
pickle.dump(('user:%s' % user['launchpad_id'], user), fd)
if user.get('gerrit_id'):
pickle.dump(('user:gerrit:%s' % user['gerrit_id'], user), fd)
if user.get('member_id'):
pickle.dump(('user:member:%s' % user['member_id'], user), fd)
for email in user.get('emails') or []:
pickle.dump((('user:%s' % email).encode('utf8'), user), fd)

View File

@@ -27,6 +27,7 @@ from stackalytics.processor import default_data_processor
from stackalytics.processor import governance
from stackalytics.processor import lp
from stackalytics.processor import mls
from stackalytics.processor import mps
from stackalytics.processor import rcs
from stackalytics.processor import record_processor
from stackalytics.processor import runtime_storage
@@ -192,6 +193,23 @@ def _process_translation_stats(runtime_storage_inst, record_processor_inst):
runtime_storage_inst.set_records(processed_translation_iterator)
def _process_member_list(uri, runtime_storage_inst, record_processor_inst):
    """Retrieve members from one list URI, process and store them.

    The member iterator comes from ``mps.log`` (configured with
    ``CONF.days_to_update_members`` and ``CONF.members_look_ahead``), is
    passed through ``_record_typer`` with the type ``'member'``, run through
    the record processor and finally handed to ``set_records``.

    :param uri: base URI of the member list
    :param runtime_storage_inst: runtime storage used for reading state and
        storing the resulting records
    :param record_processor_inst: processor whose ``process`` method is
        applied to the typed member records
    """
    raw_members = mps.log(
        uri,
        runtime_storage_inst,
        CONF.days_to_update_members,
        CONF.members_look_ahead,
    )
    typed_members = _record_typer(raw_members, 'member')
    processed_members = record_processor_inst.process(typed_members)
    runtime_storage_inst.set_records(processed_members)
def update_members(runtime_storage_inst, record_processor_inst):
    """Process every member list registered in the runtime storage.

    The list of URIs is read from the ``'member_lists'`` key; a missing or
    empty value means there is nothing to do.

    :param runtime_storage_inst: runtime storage holding ``member_lists``
    :param record_processor_inst: record processor passed on to
        ``_process_member_list``
    """
    list_uris = runtime_storage_inst.get_by_key('member_lists') or []
    for list_uri in list_uris:
        _process_member_list(list_uri, runtime_storage_inst,
                             record_processor_inst)
def _post_process_records(record_processor_inst, repos):
LOG.debug('Build release index')
release_index = {}
@@ -308,6 +326,9 @@ def main():
apply_corrections(CONF.corrections_uri, runtime_storage_inst)
# long operation should be the last
update_members(runtime_storage_inst, record_processor_inst)
runtime_storage_inst.set_by_key('runtime_storage_update_time',
utils.date_to_timestamp('now'))
LOG.info('stackalytics-processor succeeded.')

View File

@@ -0,0 +1,111 @@
# Copyright (c) 2013 Mirantis Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
import re
import time
from oslo_log import log as logging
import requests
import six
from stackalytics.processor import utils
LOG = logging.getLogger(__name__)
# Matches the member name inside the first <h3> tag and the text of the
# following "span-N last" div, which is used as the join date.
NAME_AND_DATE_PATTERN = (r'<h3>(?P<member_name>[^<]*)[\s\S]*?'
                         r'<div class="span-\d last">(?P<date_joined>[^<]*)')
# Matches the bold company name that follows the "Date Joined" label and
# the "From ... (Current)" affiliation period after it.
COMPANY_PATTERN = (r'<strong>Date\sJoined[\s\S]*?<b>(?P<company_draft>[^<]*)'
                   r'[\s\S]*?From\s(?P<date_from>[\s\S]*?)\(Current\)')
# Runs of characters that are dropped from names: / \ ~ % ^ * _
GARBAGE_PATTERN = r'[/\\~%^\*_]+'


def strip_garbage(s):
    """Drop garbage characters from *s* and collapse whitespace runs.

    Characters matched by ``GARBAGE_PATTERN`` are removed first; afterwards
    every run of whitespace is replaced by a single space. Leading and
    trailing whitespace is collapsed but not removed.
    """
    without_garbage = re.sub(GARBAGE_PATTERN, '', s)
    return re.sub(r'\s+', ' ', without_garbage)
def _retrieve_member(requests_session, uri, member_id, html_parser):
    """Fetch a member profile page and extract the member fields from it.

    :param requests_session: session passed to ``utils.read_uri``
    :param uri: URI of the profile page; stored as ``member_uri``
    :param member_id: identifier stored as ``member_id``
    :param html_parser: object whose ``unescape`` method decodes HTML
        entities in the company name
    :returns: an empty dict when nothing could be read. Otherwise a dict that
        always has ``company_draft`` and, only when the name/date pattern
        matched, also ``member_id``, ``member_name``, ``date_joined`` and
        ``member_uri``.
    """
    page = utils.read_uri(uri, session=requests_session)
    if not page:
        return {}

    member = {}

    # Only the first name/date occurrence on the page is taken.
    header = re.search(NAME_AND_DATE_PATTERN, page)
    if header is not None:
        member['member_id'] = member_id
        member['member_name'] = strip_garbage(header.group('member_name'))
        member['date_joined'] = header.group('date_joined')
        member['member_uri'] = uri

    # Default affiliation; when several company entries match, the last
    # one on the page wins.
    company_draft = '*independent'
    for company in re.finditer(COMPANY_PATTERN, page):
        company_draft = strip_garbage(
            html_parser.unescape(company.group('company_draft')))
    member['company_draft'] = company_draft

    return member
def log(uri, runtime_storage_inst, days_to_update_members, members_look_ahead):
    """Yield member profiles scraped from consecutive profile pages.

    Profile URIs are built as ``uri + str(index)``. Scanning starts right
    after the ``last_member_index`` stored in the runtime storage, or from
    index 1 when the previous full update is at least
    ``days_to_update_members`` days old (in which case
    ``last_update_members_date`` is reset to the current time).

    Scanning stops once ``members_look_ahead`` consecutive profiles yield no
    ``member_name``. When the generator is exhausted, the index of the last
    profile found is stored back under ``last_member_index``.

    :param uri: base URI to which the member index is appended
    :param runtime_storage_inst: storage providing ``get_by_key`` and
        ``set_by_key``
    :param days_to_update_members: age in days after which scanning restarts
        from the beginning
    :param members_look_ahead: number of consecutive empty profiles that
        ends the scan
    """
    LOG.debug('Retrieving new openstack.org members')

    last_update_members_date = runtime_storage_inst.get_by_key(
        'last_update_members_date') or 0
    last_member_index = runtime_storage_inst.get_by_key(
        'last_member_index') or 0

    end_update_date = int(time.time()) - days_to_update_members * 24 * 60 * 60

    if last_update_members_date <= end_update_date:
        # The previous full pass is too old: start over from the beginning.
        last_member_index = 0
        last_update_members_date = int(time.time())

        runtime_storage_inst.set_by_key('last_update_members_date',
                                        last_update_members_date)

    cnt_empty = 0
    cur_index = last_member_index + 1
    # NOTE(review): HTMLParser.unescape() was removed in Python 3.9 --
    # confirm which interpreter versions this module has to support.
    html_parser = six.moves.html_parser.HTMLParser()

    # The session is a context manager so that it is closed even when the
    # consumer abandons the generator early or a fetch raises.
    with requests.Session() as requests_session:
        while cnt_empty < members_look_ahead:
            member_id = str(cur_index)
            member = _retrieve_member(requests_session, uri + member_id,
                                      member_id, html_parser)

            if 'member_name' not in member:
                cnt_empty += 1
                cur_index += 1
                continue

            cnt_empty = 0
            last_member_index = cur_index
            cur_index += 1
            LOG.debug('New member: %s', member['member_id'])
            yield member

            # Random pause of up to five seconds between found profiles.
            time.sleep(random.random() * 5)

    LOG.debug('Last_member_index: %s', last_member_index)
    runtime_storage_inst.set_by_key('last_member_index', last_member_index)

View File

@@ -425,6 +425,39 @@ class RecordProcessor(object):
yield bug_fixed
def _process_member(self, record):
    """Fill in a member record, sync the matching user and yield the record.

    The record gets ``primary_key``, ``date``, ``author_name``, ``module``,
    ``author_email`` and ``company_name``. The company name is looked up in
    ``self.domains_index`` by the normalized company draft and falls back to
    ``utils.normalize_company_draft``. The user loaded for the member id is
    updated with the same name and company and stored back.

    :param record: raw member record with ``member_id``, ``member_name``,
        ``date_joined`` and ``company_draft``
    """
    member_user_id = user_processor.make_user_id(
        member_id=record['member_id'])

    record['primary_key'] = member_user_id
    record['date'] = utils.member_date_to_timestamp(record['date_joined'])
    record['author_name'] = record['member_name']
    record['module'] = 'unknown'

    draft = record['company_draft']
    resolved_company = self.domains_index.get(
        utils.normalize_company_name(draft))
    if not resolved_company:
        resolved_company = utils.normalize_company_draft(draft)

    # author_email is a key to create new user
    record['author_email'] = member_user_id
    record['company_name'] = resolved_company
    # _update_record_and_user function will create new user if needed
    self._update_record_and_user(record)
    # Set again after the call above (presumably it may overwrite the
    # company name -- verify against _update_record_and_user).
    record['company_name'] = resolved_company

    member_user = user_processor.load_user(self.runtime_storage_inst,
                                           user_id=member_user_id)
    member_user['user_name'] = record['author_name']
    current_affiliation = {
        'company_name': resolved_company,
        'end_date': 0,
    }
    member_user['companies'] = [current_affiliation]
    member_user['company_name'] = resolved_company
    user_processor.store_user(self.runtime_storage_inst, member_user)

    record['company_name'] = resolved_company
    yield record
def _process_translation(self, record):
# todo split translation and approval
translation = record.copy()
@@ -455,6 +488,7 @@ class RecordProcessor(object):
'email': self._process_email,
'bp': self._process_blueprint,
'bug': self._process_bug,
'member': self._process_member,
'i18n': self._process_translation,
}
@@ -681,6 +715,39 @@ class RecordProcessor(object):
self.runtime_storage_inst.set_records(
self._close_patch(cores, marks_patch['marks']))
def _update_members_company_name(self):
    """Yield a handler that refreshes company names of member records.

    The yielded ``record_handler`` is itself a generator function: for a
    member record whose resolved company name differs from the stored one it
    updates the record, yields it, and then rewrites the ``companies`` list
    of the user referenced by ``record['user_id']``.
    """
    LOG.info('Update members with company names')

    def record_handler(record):
        # Only member records are of interest here.
        if record['record_type'] != 'member':
            return

        draft = record['company_draft']
        resolved_company = self.domains_index.get(
            utils.normalize_company_name(draft))
        if not resolved_company:
            resolved_company = utils.normalize_company_draft(draft)

        # Nothing to do when the company name is already up to date.
        if resolved_company == record['company_name']:
            return

        LOG.debug('Update record %s, company name changed to %s',
                  record, resolved_company)
        record['company_name'] = resolved_company

        yield record

        # The user is updated only after the consumer resumes the generator.
        member_user = user_processor.load_user(self.runtime_storage_inst,
                                               user_id=record['user_id'])
        LOG.debug('Update user %s, company name changed to %s',
                  member_user, resolved_company)
        current_affiliation = {
            'company_name': resolved_company,
            'end_date': 0,
        }
        member_user['companies'] = [current_affiliation]
        user_processor.store_user(self.runtime_storage_inst, member_user)

    yield record_handler
def _update_commits_with_module_alias(self):
LOG.info('Update record with aliases')
@@ -706,6 +773,7 @@ class RecordProcessor(object):
self._update_commits_with_module_alias,
self._update_blueprints_with_mention_info,
self._determine_core_contributors,
self._update_members_company_name,
self._update_marks_with_disagreement,
]

View File

@@ -225,6 +225,12 @@ default_data = {
"type": "string"
}
},
"member_lists": {
"type": "array",
"items": {
"type": "string"
}
},
"project_types": {
"type": "array",
"items": {

View File

@@ -22,11 +22,13 @@ ROBOTS = '*robots'
def make_user_id(emails=None, launchpad_id=None, gerrit_id=None,
github_id=None, zanata_id=None):
member_id=None, github_id=None, zanata_id=None):
if launchpad_id or emails:
return launchpad_id or emails[0]
if gerrit_id:
return 'gerrit:%s' % gerrit_id
if member_id:
return 'member:%s' % member_id
if github_id:
return 'github:%s' % github_id
if zanata_id:
@@ -58,11 +60,11 @@ def store_user(runtime_storage_inst, user):
def load_user(runtime_storage_inst, seq=None, user_id=None, email=None,
launchpad_id=None, gerrit_id=None, github_id=None,
zanata_id=None):
launchpad_id=None, gerrit_id=None, member_id=None,
github_id=None, zanata_id=None):
key = make_user_id(gerrit_id=gerrit_id, github_id=github_id,
zanata_id=zanata_id)
key = make_user_id(gerrit_id=gerrit_id, member_id=member_id,
github_id=github_id, zanata_id=zanata_id)
if not key:
key = seq or user_id or launchpad_id or email
if key:

View File

@@ -63,6 +63,13 @@ def date_to_timestamp_ext(d):
return int(d)
def member_date_to_timestamp(d):
    """Convert a member join date such as ``'March 5, 2014'`` to a timestamp.

    Surrounding whitespace is ignored, so both ``'March 5, 2014'`` and the
    ``'March 5, 2014 '`` form with a trailing space are accepted. Previously
    the trailing whitespace was mandatory and its absence raised
    ``ValueError``.

    :param d: date string in ``'%B %d, %Y'`` format, or a falsy value
    :returns: integer timestamp computed with ``time.mktime`` (i.e. the date
        is interpreted in the local time zone), or 0 when *d* is empty or
        contains only whitespace
    :raises ValueError: if a non-empty *d* does not match the format
    """
    if not d:
        return 0

    d = d.strip()
    if not d:
        return 0

    # NOTE(review): '%B' depends on the current locale and mktime() on the
    # local time zone -- confirm both are what the callers expect.
    parsed = datetime.datetime.strptime(d, '%B %d, %Y')
    return int(time.mktime(parsed.timetuple()))
def iso8601_to_timestamp(s):
return calendar.timegm(iso8601.parse_date(s).utctimetuple())