From e40cb6857c5b47ba41111d6f2a395c7cd4a3f76c Mon Sep 17 00:00:00 2001 From: pkholkin Date: Tue, 3 Jun 2014 13:53:32 +0400 Subject: [PATCH] Fixed processing company-names for members 1) Erasing commas 2) Changing sequence of spaces into one space 3) Added three aliases Change-Id: Ie7e15f73c027943e43a17a9b6245ad2fd1f6f36a --- etc/default_data.json | 6 +++--- stackalytics/processor/default_data_processor.py | 3 ++- stackalytics/processor/record_processor.py | 2 +- stackalytics/processor/utils.py | 6 ++++++ 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/etc/default_data.json b/etc/default_data.json index c1be896c2..086df353b 100644 --- a/etc/default_data.json +++ b/etc/default_data.json @@ -5879,7 +5879,7 @@ { "domains": [""], "company_name": "*independent", - "aliases": ["None", "Non", "l-", ".", "****", "1", "aaa", "-", "dsadsadsadsad", "I dont have one", "company", "n/a", "Self", "Student", "home", "Home Based", "Independent", "Independen", "Independant", "MyHome", "HomeOffice", "Self Employeed", "Self Employed", "myself", "Self-employeed", "individual", "Individual Contributor", "Unaffiliated", "没有", "Null", "Univerisity", "fsfsf", "xxx"] + "aliases": ["None", "Non", "l-", ".", "****", "1", "aaa", "-", "dsadsadsadsad", "I dont have one", "company", "n/a", "Self", "Student", "home", "Home Based", "Independent", "Independen", "Independant", "MyHome", "HomeOffice", "Self Employeed", "Self Employed", "myself", "Self-employeed", "individual", "Individual Contributor", "Unaffiliated", "没有", "Null", "Univerisity", "fsfsf", "xxx", "no job"] }, { "domains": ["360.cn"], @@ -6229,7 +6229,7 @@ { "domains": ["hp.com"], "company_name": "HP", - "aliases": ["HP Cloud", "HP ES GD China", "HP, IBM", "HP Software", "HP Storage Division", "Hewlett Packard", "Hewlett-Packard Company", "Hewlett-Packard", "Hewllet-Packard", "HP R and D", "HP Cloud OS", "HP Networking", "hewelett-packard company", "HewlettPackard", "Hewlett-Pack"] + "aliases": ["HP Cloud", "HP ES GD China", "HP, IBM", "HP Software", "HP Storage Division", "Hewlett Packard", "Hewlett-Packard Company", "Hewlett-Packard", "Hewllet-Packard", "HP R and D", "HP Cloud OS", "HP Networking", "hewelett-packard company", "HewlettPackard", "Hewlett-Pack", "Hewlitt-Packard"] }, { "domains": ["huawei.com"], @@ -6239,7 +6239,7 @@ { "domains": ["ibm.com", "linux.vnet.ibm.com"], "company_name": "IBM", - "aliases": ["IBM Australia", "IBM Canada", "IBM Canada Ltd", "IBM China", "IBM Corporation", "IBM India Pvt Ltd", "IBM India Pvt. Ltd.", "IBM Japan, Ltd.", "IBM Research", "IBM Research - China", "IBM Research Lab, India", "IBM Deutschland Research & Development GmbH", "International Business Machines Corporation"] + "aliases": ["IBM Australia", "IBM Canada", "IBM Canada Ltd", "IBM China", "IBM Corporation", "IBM India Pvt Ltd", "IBM India Pvt. Ltd.", "IBM Japan, Ltd.", "IBM Research", "IBM Research - China", "IBM Research Lab, India", "IBM Deutschland Research & Development GmbH", "International Business Machines Corporation", "IBM UK Ltd"] }, { "domains": ["ifca.unican.es"], diff --git a/stackalytics/processor/default_data_processor.py b/stackalytics/processor/default_data_processor.py index ea24a14e0..a0540a907 100644 --- a/stackalytics/processor/default_data_processor.py +++ b/stackalytics/processor/default_data_processor.py @@ -179,7 +179,8 @@ def _get_changed_member_records(runtime_storage_inst, record_processor_inst): if record['record_type'] == 'member' and 'company_name' in record: company_draft = record['company_draft'] company_name = record_processor_inst.domains_index.get( - utils.normalize_company_name(company_draft)) or company_draft + utils.normalize_company_name(company_draft)) or ( + utils.normalize_company_draft(company_draft)) if company_name != record['company_name']: record['company_name'] = company_name diff --git a/stackalytics/processor/record_processor.py b/stackalytics/processor/record_processor.py index 14a31c1c8..231b6b9aa 100644 --- a/stackalytics/processor/record_processor.py +++ b/stackalytics/processor/record_processor.py @@ -426,7 +426,7 @@ class RecordProcessor(object): company_draft = record['company_draft'] company_name = self.domains_index.get(utils.normalize_company_name( - company_draft)) or company_draft + company_draft)) or (utils.normalize_company_draft(company_draft)) # author_email is a key to create new user record['author_email'] = user_id diff --git a/stackalytics/processor/utils.py b/stackalytics/processor/utils.py index 302b78a24..3bd76bfdf 100644 --- a/stackalytics/processor/utils.py +++ b/stackalytics/processor/utils.py @@ -210,3 +210,9 @@ def normalize_company_name(name): regex += '|' + '((^|\\s)(' + '|'.join(BAD_NAME_SUFFIXES_WITH_STOPS) + '))' name = re.sub(re.compile(regex, re.IGNORECASE), '', name) return ''.join([c.lower() for c in name if c.isalnum()]) + + +def normalize_company_draft(name): + name = re.sub(',', ' ', name) + name = re.sub(r'\s+', ' ', name) + return name