From b9aae47c6c4e6f4ccdd044c0e13281bd618b9b86 Mon Sep 17 00:00:00 2001
From: Doug Hellmann
Date: Tue, 1 May 2018 17:30:52 -0400
Subject: [PATCH] add --anonymize flag to the summarize command

This flag can help us look at numbers without being distracted by who
is making the contributions, which is useful for understanding if a
single organization is contributing disproportionately relative to
others.

Change-Id: I6fcaf4cc19441f162ec6b9c6f3ee8bd780878a76
Signed-off-by: Doug Hellmann
---
 goal_tools/tests/test_summarize.py | 55 ++++++++++++++++++++++++++++++
 goal_tools/who_helped/summarize.py | 46 +++++++++++++++++++++++++
 2 files changed, 101 insertions(+)

diff --git a/goal_tools/tests/test_summarize.py b/goal_tools/tests/test_summarize.py
index 5beff61..17e0058 100644
--- a/goal_tools/tests/test_summarize.py
+++ b/goal_tools/tests/test_summarize.py
@@ -64,3 +64,58 @@ class TestSummarizeBy(base.TestCase):
             ('A',): 2,
         }
         self.assertEqual(expected, results)
+
+
+class TestAnonymize(base.TestCase):
+
+    def test_anonymizer(self):
+        a = summarize.Anonymizer('Field')
+        self.assertEqual('Field 1', a('anything'))
+        self.assertEqual('Field 1', a('anything'))
+        self.assertEqual('Field 2', a('anything else'))
+        self.assertEqual('Field 2', a('anything else'))
+
+    def test_not_needed(self):
+        original = [('a', 'b', 1)]
+        group_by = ('Field1', 'Field2')
+        actual = list(summarize.anonymize(group_by, original))
+        self.assertEqual(original, actual)
+
+    def test_organization(self):
+        original = [
+            ('a', 'b', 2),
+            ('c', 'd', 1),
+        ]
+        group_by = ['Organization', 'Field2']
+        expected = [
+            ('Organization 1', 'b', 2),
+            ('Organization 2', 'd', 1),
+        ]
+        actual = list(summarize.anonymize(group_by, original))
+        self.assertEqual(expected, actual)
+
+    def test_name(self):
+        original = [
+            ('a', 'b', 2),
+            ('c', 'd', 1),
+        ]
+        group_by = ['Field1', 'Name']
+        expected = [
+            ('a', 'Name 1', 2),
+            ('c', 'Name 2', 1),
+        ]
+        actual = list(summarize.anonymize(group_by, original))
+        self.assertEqual(expected, actual)
+
+    def test_email(self):
+        original = [
+            ('a', 'b', 2),
+            ('c', 'd', 1),
+        ]
+        group_by = ['Field1', 'Email']
+        expected = [
+            ('a', 'Email 1', 2),
+            ('c', 'Email 2', 1),
+        ]
+        actual = list(summarize.anonymize(group_by, original))
+        self.assertEqual(expected, actual)
diff --git a/goal_tools/who_helped/summarize.py b/goal_tools/who_helped/summarize.py
index 65dd5fc..7f9740d 100644
--- a/goal_tools/who_helped/summarize.py
+++ b/goal_tools/who_helped/summarize.py
@@ -11,6 +11,7 @@
 # under the License.
 
 import collections
+import itertools
 import logging
 
 from goal_tools.who_helped import contributions
@@ -33,6 +34,41 @@ def _count_distinct(by_names, to_count, data_source):
     return {k: len(v) for k, v in counts.items()}
 
 
+class Anonymizer:
+    "Track unique values for a field while masking them."
+
+    def __init__(self, field):
+        self.field = field
+        self.cache = {}
+        self.counter = itertools.count(1)
+
+    def __repr__(self):
+        return 'Anonymizer({!r})'.format(self.field)
+
+    def __call__(self, value):
+        if value not in self.cache:
+            anon = '{} {}'.format(self.field, next(self.counter))
+            self.cache[value] = anon
+        return self.cache[value]
+
+
+def anonymize(group_by, data):
+    "Turn the fields with identifying information into anonymous strings."
+    generators = {
+        'Organization': Anonymizer('Organization'),
+        'Name': Anonymizer('Name'),
+        'Email': Anonymizer('Email'),
+    }
+    modifiers = [
+        generators.get(field, lambda x: x)
+        for field in group_by
+    ]
+    modifiers.append(lambda x: x)  # for the count field
+    for row in data:
+        new_row = tuple(m(r) for m, r in zip(modifiers, row))
+        yield new_row
+
+
 class SummarizeContributions(report.ContributionsReportBase):
     "Summarize a contribution report."
 
@@ -54,6 +90,13 @@ class SummarizeContributions(report.ContributionsReportBase):
             help=('combination of unique values to count '
                   '(may be repeated), defaults to counting each contribution'),
         )
+        parser.add_argument(
+            '--anonymize', '--anon',
+            dest='anonymize',
+            default=False,
+            action='store_true',
+            help='mask organization and personal identifying information',
+        )
         return parser
 
     def take_action(self, parsed_args):
@@ -74,6 +117,9 @@ class SummarizeContributions(report.ContributionsReportBase):
             key=lambda x: (x[-1], x[:-1]),  # by count first
         ))
 
+        if parsed_args.anonymize:
+            output_rows = anonymize(group_by, output_rows)
+
         columns = tuple(group_by) + (to_count_column,)
 
         return (columns, output_rows)