add --anonymize flag to the summarize command

This flag can help us look at numbers without being distracted by who is making the contributions, which is useful for understanding whether a single organization is contributing disproportionately relative to others.

Change-Id: I6fcaf4cc19441f162ec6b9c6f3ee8bd780878a76
Signed-off-by: Doug Hellmann <doug@doughellmann.com>
2018-05-01 17:30:52 -04:00 · 2018-05-01 17:30:52 -04:00 · b9aae47c6c
commit b9aae47c6c
parent f9c70b6751
2 changed files with 101 additions and 0 deletions
--- a/goal_tools/tests/test_summarize.py
+++ b/goal_tools/tests/test_summarize.py
@ -64,3 +64,58 @@ class TestSummarizeBy(base.TestCase):
            ('A',): 2,
        }
        self.assertEqual(expected, results)
 class TestAnonymize(base.TestCase):
    """Tests for summarize.Anonymizer and summarize.anonymize."""

    def test_anonymizer(self):
        """A repeated value keeps its label; a new value gets the next one."""
        masker = summarize.Anonymizer('Field')
        # Every input is submitted twice to confirm its label is stable.
        checks = [
            ('Field 1', 'anything'),
            ('Field 1', 'anything'),
            ('Field 2', 'anything else'),
            ('Field 2', 'anything else'),
        ]
        for label, raw in checks:
            self.assertEqual(label, masker(raw))

    def test_not_needed(self):
        """Rows pass through unchanged when no masked field is grouped on."""
        rows = [('a', 'b', 1)]
        result = list(summarize.anonymize(('Field1', 'Field2'), rows))
        self.assertEqual(rows, result)

    def test_organization(self):
        """The column grouped as 'Organization' is replaced by labels."""
        rows = [('a', 'b', 2), ('c', 'd', 1)]
        columns = ['Organization', 'Field2']
        result = list(summarize.anonymize(columns, rows))
        self.assertEqual(
            [('Organization 1', 'b', 2), ('Organization 2', 'd', 1)],
            result,
        )

    def test_name(self):
        """The column grouped as 'Name' is replaced by labels."""
        rows = [('a', 'b', 2), ('c', 'd', 1)]
        columns = ['Field1', 'Name']
        result = list(summarize.anonymize(columns, rows))
        self.assertEqual(
            [('a', 'Name 1', 2), ('c', 'Name 2', 1)],
            result,
        )

    def test_email(self):
        """The column grouped as 'Email' is replaced by labels."""
        rows = [('a', 'b', 2), ('c', 'd', 1)]
        columns = ['Field1', 'Email']
        result = list(summarize.anonymize(columns, rows))
        self.assertEqual(
            [('a', 'Email 1', 2), ('c', 'Email 2', 1)],
            result,
        )
--- a/goal_tools/who_helped/summarize.py
+++ b/goal_tools/who_helped/summarize.py
@ -11,6 +11,7 @@
 # under the License.
 import collections
 import itertools
 import logging
 from goal_tools.who_helped import contributions
@ -33,6 +34,41 @@ def _count_distinct(by_names, to_count, data_source):
    return {k: len(v) for k, v in counts.items()}
 class Anonymizer:
    """Callable that swaps each distinct value for a numbered label.

    The first new value passed in is labelled '<field> 1', the next new
    value '<field> 2', and so on. A value that was seen before gets the
    label it was given the first time.
    """

    def __init__(self, field):
        # field: text placed in front of the number in every label
        self.counter = itertools.count(1)
        self.cache = {}
        self.field = field

    def __call__(self, value):
        """Return the label for value, assigning a new one if needed."""
        label = self.cache.get(value)
        # Stored labels are always strings, so None can only mean
        # that this value has not been seen yet.
        if label is None:
            label = '{} {}'.format(self.field, next(self.counter))
            self.cache[value] = label
        return label

    def __repr__(self):
        return 'Anonymizer({!r})'.format(self.field)
 def anonymize(group_by, data):
    """Yield the rows of data with identifying columns masked.

    group_by names the leading columns of each row, in order. Columns
    named 'Organization', 'Name' or 'Email' are passed through an
    Anonymizer for that name; every other column, and the one column
    that follows the group_by columns, is yielded as-is. Each result
    row is a tuple.
    """
    def passthrough(value):
        return value

    maskers = {
        name: Anonymizer(name)
        for name in ('Organization', 'Name', 'Email')
    }
    transforms = [maskers.get(column, passthrough) for column in group_by]
    # One extra slot so the count column after the grouped ones is kept.
    transforms.append(passthrough)
    for record in data:
        yield tuple(fn(cell) for fn, cell in zip(transforms, record))
 class SummarizeContributions(report.ContributionsReportBase):
    "Summarize a contribution report."
@ -54,6 +90,13 @@ class SummarizeContributions(report.ContributionsReportBase):
            help=('combination of unique values to count '
                  '(may be repeated), defaults to counting each contribution'),
        )
        parser.add_argument(
            '--anonymize', '--anon',
            dest='anonymize',
            default=False,
            action='store_true',
            help='mask organization and personal identifying information',
        )
        return parser
    def take_action(self, parsed_args):
@ -74,6 +117,9 @@ class SummarizeContributions(report.ContributionsReportBase):
            key=lambda x: (x[-1], x[:-1]),  # by count first
        ))
        if parsed_args.anonymize:
            output_rows = anonymize(group_by, output_rows)
        columns = tuple(group_by) + (to_count_column,)
        return (columns, output_rows)