Add utils.keep_safe_chars() and use it to sanitize commit author emails.

keep_safe_chars(s) removes every run of characters that falls outside the
ranges \x21-\x7e and \x80-\xff (so spaces, control characters and code
points above U+00FF are dropped).  class Git in vcs.py now passes
commit['author_email'] through it, and a unit test covers the helper.

NOTE(review): the indentation of the context lines below was reconstructed
from a whitespace-collapsed copy of this patch -- confirm against the target
files (or apply with whitespace tolerance) before relying on it.

diff --git a/stackalytics/processor/utils.py b/stackalytics/processor/utils.py
index 145af381f..508801adf 100644
--- a/stackalytics/processor/utils.py
+++ b/stackalytics/processor/utils.py
@@ -234,6 +234,10 @@ def safe_encode(s):
     return six.moves.urllib.parse.quote(s.encode('utf-8'))
 
 
+def keep_safe_chars(s):
+    return re.sub(r'[^\x21-\x7e\x80-\xff]+', '', s)
+
+
 def make_module_group(module_group_id, name=None, modules=None, tag='module'):
     return {'id': module_group_id,
             'module_group_name': name or module_group_id,
diff --git a/stackalytics/processor/vcs.py b/stackalytics/processor/vcs.py
index 161d02913..37c25e55e 100644
--- a/stackalytics/processor/vcs.py
+++ b/stackalytics/processor/vcs.py
@@ -211,6 +211,9 @@ class Git(Vcs):
                 # ignore commits with empty email (there are some < Essex)
                 continue
 
+            commit['author_email'] = utils.keep_safe_chars(
+                commit['author_email'])
+
             diff_stat_str = rec.group('diff_stat')
             diff_rec = re.search(DIFF_STAT_PATTERN, diff_stat_str)
 
diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py
index 2c71e1e34..a65a29670 100644
--- a/tests/unit/test_utils.py
+++ b/tests/unit/test_utils.py
@@ -78,6 +78,12 @@ class TestUtils(testtools.TestCase):
         self.assertEqual(expected, utils.add_index(
             sequence, start=0, item_filter=lambda x: x['name'] != 'B'))
 
+    def test_keep_safe_chars(self):
+        self.assertEqual('somemoretext',
+                         utils.keep_safe_chars('some more text'))
+        self.assertEqual(u'(unicode)',
+                         utils.keep_safe_chars(u'(unicode \u0423) '))
+
     def test_normalize_company_name(self):
         company_names = ['EMC Corporation', 'Abc, corp..', 'Mirantis IT.',
                          'Red Hat, Inc.', 'abc s.r.o. ABC', '2s.r.o. co',