diff --git a/stackalytics/processor/default_data_processor.py b/stackalytics/processor/default_data_processor.py index 5dd08ce01..d2270b3c7 100644 --- a/stackalytics/processor/default_data_processor.py +++ b/stackalytics/processor/default_data_processor.py @@ -140,6 +140,7 @@ def _store_users(runtime_storage_inst, users): if stored_user: stored_user.update(user) user = stored_user + user['static'] = True utils.store_user(runtime_storage_inst, user) diff --git a/stackalytics/processor/record_processor.py b/stackalytics/processor/record_processor.py index 1dae6ac6f..aa079c8bf 100644 --- a/stackalytics/processor/record_processor.py +++ b/stackalytics/processor/record_processor.py @@ -163,10 +163,13 @@ class RecordProcessor(object): # collect ordinary fields for key in ['seq', 'user_name', 'user_id', - 'launchpad_id', 'companies']: + 'launchpad_id', 'companies', 'static']: merged_user[key] = next((v.get(key) for v in user_profiles if v.get(key)), None) + if not merged_user['static']: + del merged_user['static'] + # update user_id, prefer it to be equal to launchpad_id merged_user['user_id'] = (merged_user['launchpad_id'] or merged_user['user_id']) @@ -248,9 +251,11 @@ class RecordProcessor(object): record['author_name'] = user['user_name'] company, policy = self._find_company(user['companies'], record['date']) - if company != '*robots' and policy == 'open': - company = (self._get_company_by_email(record.get('author_email')) - or company) + if not user.get('static'): + # for auto-generated profiles affiliation may be overridden + if company != '*robots' and policy == 'open': + company = (self._get_company_by_email( + record.get('author_email')) or company) record['company_name'] = company def _process_commit(self, record): diff --git a/tests/unit/test_record_processor.py b/tests/unit/test_record_processor.py index 540c7ccc0..b57c01326 100644 --- a/tests/unit/test_record_processor.py +++ b/tests/unit/test_record_processor.py @@ -208,6 +208,38 @@ class 
TestRecordProcessor(testtools.TestCase): self.assertIn('johndoe@ibm.com', utils.load_user( record_processor_inst.runtime_storage_inst, 'john_doe')['emails']) + def test_process_commit_existing_user_new_email_known_company_static(self): + # The user profile comes from default_data (it is marked 'static'). + # The commit email is new and maps to another company (IBM), yet the + # company specified in the profile (NEC) must still be used. + record_processor_inst = self.make_record_processor( + users=[ + {'user_id': 'john_doe', + 'launchpad_id': 'john_doe', + 'user_name': 'John Doe', + 'static': True, + 'emails': ['johndoe@nec.co.jp'], + 'companies': [{'company_name': 'NEC', 'end_date': 0}]} + ], + companies=[{'company_name': 'IBM', 'domains': ['ibm.com']}], + lp_info={'johndoe@ibm.com': + {'name': 'john_doe', 'display_name': 'John Doe'}}) + + processed_commit = list(record_processor_inst.process( + generate_commits(author_email='johndoe@ibm.com', + author_name='John Doe')))[0] + + expected_commit = { + 'launchpad_id': 'john_doe', + 'author_email': 'johndoe@ibm.com', + 'author_name': 'John Doe', + 'company_name': 'NEC', + } + + self.assertRecordsMatch(expected_commit, processed_commit) + self.assertIn('johndoe@ibm.com', utils.load_user( + record_processor_inst.runtime_storage_inst, 'john_doe')['emails']) + def test_process_commit_existing_user_old_job_not_overridden(self): # User is known to LP, his email is new to us, and maps to other # company. Have some record with new email, but from the period when