Fix processing of company names for members

1) Replace commas with spaces
2) Collapse each run of whitespace into a single space
3) Add three aliases

Change-Id: Ie7e15f73c027943e43a17a9b6245ad2fd1f6f36a
This commit is contained in:
pkholkin 2014-06-03 13:53:32 +04:00
parent 86a8393d6a
commit e40cb6857c
4 changed files with 12 additions and 5 deletions

View File

@ -5879,7 +5879,7 @@
{
"domains": [""],
"company_name": "*independent",
"aliases": ["None", "Non", "l-", ".", "****", "1", "aaa", "-", "dsadsadsadsad", "I dont have one", "company", "n/a", "Self", "Student", "home", "Home Based", "Independent", "Independen", "Independant", "MyHome", "HomeOffice", "Self Employeed", "Self Employed", "myself", "Self-employeed", "individual", "Individual Contributor", "Unaffiliated", "没有", "Null", "Univerisity", "fsfsf", "xxx"]
"aliases": ["None", "Non", "l-", ".", "****", "1", "aaa", "-", "dsadsadsadsad", "I dont have one", "company", "n/a", "Self", "Student", "home", "Home Based", "Independent", "Independen", "Independant", "MyHome", "HomeOffice", "Self Employeed", "Self Employed", "myself", "Self-employeed", "individual", "Individual Contributor", "Unaffiliated", "没有", "Null", "Univerisity", "fsfsf", "xxx", "no job"]
},
{
"domains": ["360.cn"],
@ -6229,7 +6229,7 @@
{
"domains": ["hp.com"],
"company_name": "HP",
"aliases": ["HP Cloud", "HP ES GD China", "HP, IBM", "HP Software", "HP Storage Division", "Hewlett Packard", "Hewlett-Packard Company", "Hewlett-Packard", "Hewllet-Packard", "HP R and D", "HP Cloud OS", "HP Networking", "hewelett-packard company", "HewlettPackard", "Hewlett-Pack"]
"aliases": ["HP Cloud", "HP ES GD China", "HP, IBM", "HP Software", "HP Storage Division", "Hewlett Packard", "Hewlett-Packard Company", "Hewlett-Packard", "Hewllet-Packard", "HP R and D", "HP Cloud OS", "HP Networking", "hewelett-packard company", "HewlettPackard", "Hewlett-Pack", "Hewlitt-Packard"]
},
{
"domains": ["huawei.com"],
@ -6239,7 +6239,7 @@
{
"domains": ["ibm.com", "linux.vnet.ibm.com"],
"company_name": "IBM",
"aliases": ["IBM Australia", "IBM Canada", "IBM Canada Ltd", "IBM China", "IBM Corporation", "IBM India Pvt Ltd", "IBM India Pvt. Ltd.", "IBM Japan, Ltd.", "IBM Research", "IBM Research - China", "IBM Research Lab, India", "IBM Deutschland Research & Development GmbH", "International Business Machines Corporation"]
"aliases": ["IBM Australia", "IBM Canada", "IBM Canada Ltd", "IBM China", "IBM Corporation", "IBM India Pvt Ltd", "IBM India Pvt. Ltd.", "IBM Japan, Ltd.", "IBM Research", "IBM Research - China", "IBM Research Lab, India", "IBM Deutschland Research & Development GmbH", "International Business Machines Corporation", "IBM UK Ltd"]
},
{
"domains": ["ifca.unican.es"],

View File

@ -179,7 +179,8 @@ def _get_changed_member_records(runtime_storage_inst, record_processor_inst):
if record['record_type'] == 'member' and 'company_name' in record:
company_draft = record['company_draft']
company_name = record_processor_inst.domains_index.get(
utils.normalize_company_name(company_draft)) or company_draft
utils.normalize_company_name(company_draft)) or (
utils.normalize_company_draft(company_draft))
if company_name != record['company_name']:
record['company_name'] = company_name

View File

@ -426,7 +426,7 @@ class RecordProcessor(object):
company_draft = record['company_draft']
company_name = self.domains_index.get(utils.normalize_company_name(
company_draft)) or company_draft
company_draft)) or (utils.normalize_company_draft(company_draft))
# author_email is a key to create new user
record['author_email'] = user_id

View File

@ -210,3 +210,9 @@ def normalize_company_name(name):
regex += '|' + '((^|\\s)(' + '|'.join(BAD_NAME_SUFFIXES_WITH_STOPS) + '))'
name = re.sub(re.compile(regex, re.IGNORECASE), '', name)
return ''.join([c.lower() for c in name if c.isalnum()])
def normalize_company_draft(name):
    """Clean up a free-form company name string.

    Commas are replaced with spaces, every run of whitespace is collapsed
    into a single space, and leading/trailing whitespace is removed.

    :param name: company name as a string
    :returns: the cleaned-up name
    """
    # Substitute a space rather than deleting the comma so that the words
    # on either side of it ("HP,IBM") are not glued together.
    name = re.sub(',', ' ', name)
    name = re.sub(r'\s+', ' ', name)
    # A leading/trailing comma or blank would otherwise survive as a stray
    # single space, so "IBM," and "IBM" would yield different names.
    return name.strip()