add --anonymize flag to the summarize command

This flag can help us look at numbers without being distracted by who is making the contributions, which is useful for understanding whether a single organization is contributing disproportionately relative to others.

Change-Id: I6fcaf4cc19441f162ec6b9c6f3ee8bd780878a76
Signed-off-by: Doug Hellmann <doug@doughellmann.com>
2018-05-01 17:30:52 -04:00 · 2018-05-01 17:30:52 -04:00 · b9aae47c6c
commit b9aae47c6c
parent f9c70b6751
2 changed files with 101 additions and 0 deletions
--- a/goal_tools/tests/test_summarize.py
+++ b/goal_tools/tests/test_summarize.py
@@ -64,3 +64,58 @@ class TestSummarizeBy(base.TestCase):
            ('A',): 2,
        }
        self.assertEqual(expected, results)
+
+
+class TestAnonymize(base.TestCase):
+
+    def test_anonymizer(self):
+        a = summarize.Anonymizer('Field')
+        self.assertEqual('Field 1', a('anything'))
+        self.assertEqual('Field 1', a('anything'))
+        self.assertEqual('Field 2', a('anything else'))
+        self.assertEqual('Field 2', a('anything else'))
+
+    def test_not_needed(self):
+        original = [('a', 'b', 1)]
+        group_by = ('Field1', 'Field2')
+        actual = list(summarize.anonymize(group_by, original))
+        self.assertEqual(original, actual)
+
+    def test_organization(self):
+        original = [
+            ('a', 'b', 2),
+            ('c', 'd', 1),
+        ]
+        group_by = ['Organization', 'Field2']
+        expected = [
+            ('Organization 1', 'b', 2),
+            ('Organization 2', 'd', 1),
+        ]
+        actual = list(summarize.anonymize(group_by, original))
+        self.assertEqual(expected, actual)
+
+    def test_name(self):
+        original = [
+            ('a', 'b', 2),
+            ('c', 'd', 1),
+        ]
+        group_by = ['Field1', 'Name']
+        expected = [
+            ('a', 'Name 1', 2),
+            ('c', 'Name 2', 1),
+        ]
+        actual = list(summarize.anonymize(group_by, original))
+        self.assertEqual(expected, actual)
+
+    def test_email(self):
+        original = [
+            ('a', 'b', 2),
+            ('c', 'd', 1),
+        ]
+        group_by = ['Field1', 'Email']
+        expected = [
+            ('a', 'Email 1', 2),
+            ('c', 'Email 2', 1),
+        ]
+        actual = list(summarize.anonymize(group_by, original))
+        self.assertEqual(expected, actual)
--- a/goal_tools/who_helped/summarize.py
+++ b/goal_tools/who_helped/summarize.py
@@ -11,6 +11,7 @@
 # under the License.

 import collections
+import itertools
 import logging

 from goal_tools.who_helped import contributions
@@ -33,6 +34,41 @@ def _count_distinct(by_names, to_count, data_source):
    return {k: len(v) for k, v in counts.items()}


+class Anonymizer:
+    "Track unique values for a field while masking them."
+
+    def __init__(self, field):
+        self.field = field
+        self.cache = {}
+        self.counter = itertools.count(1)
+
+    def __repr__(self):
+        return 'Anonymizer({!r})'.format(self.field)
+
+    def __call__(self, value):
+        if value not in self.cache:
+            anon = '{} {}'.format(self.field, next(self.counter))
+            self.cache[value] = anon
+        return self.cache[value]
+
+
+def anonymize(group_by, data):
+    "Turn the fields with identifying information into anonymous strings."
+    generators = {
+        'Organization': Anonymizer('Organization'),
+        'Name': Anonymizer('Name'),
+        'Email': Anonymizer('Email'),
+    }
+    modifiers = [
+        generators.get(field, lambda x: x)
+        for field in group_by
+    ]
+    modifiers.append(lambda x: x)  # for the count field
+    for row in data:
+        new_row = tuple(m(r) for m, r in zip(modifiers, row))
+        yield new_row
+
+
 class SummarizeContributions(report.ContributionsReportBase):
    "Summarize a contribution report."

@@ -54,6 +90,13 @@ class SummarizeContributions(report.ContributionsReportBase):
            help=('combination of unique values to count '
                  '(may be repeated), defaults to counting each contribution'),
        )
+        parser.add_argument(
+            '--anonymize', '--anon',
+            dest='anonymize',
+            default=False,
+            action='store_true',
+            help='mask organization and personal identifying information',
+        )
        return parser

    def take_action(self, parsed_args):
@@ -74,6 +117,9 @@ class SummarizeContributions(report.ContributionsReportBase):
            key=lambda x: (x[-1], x[:-1]),  # by count first
        ))

+        if parsed_args.anonymize:
+            output_rows = anonymize(group_by, output_rows)
+
        columns = tuple(group_by) + (to_count_column,)

        return (columns, output_rows)