add --anonymize flag to the summarize command

This flag can help us look at numbers without being distracted by who
is making the contributions, which is useful for understanding if a
single organization is contributing disproportionately relative to
others.

Change-Id: I6fcaf4cc19441f162ec6b9c6f3ee8bd780878a76
Signed-off-by: Doug Hellmann <doug@doughellmann.com>
This commit is contained in:
Doug Hellmann 2018-05-01 17:30:52 -04:00
parent f9c70b6751
commit b9aae47c6c
2 changed files with 101 additions and 0 deletions

View File

@ -64,3 +64,58 @@ class TestSummarizeBy(base.TestCase):
('A',): 2, ('A',): 2,
} }
self.assertEqual(expected, results) self.assertEqual(expected, results)
class TestAnonymize(base.TestCase):
    """Tests for ``summarize.Anonymizer`` and ``summarize.anonymize``."""

    def _anonymized(self, group_by, rows):
        # anonymize() is consumed through list() so the result can be
        # compared to the expected rows with plain list equality.
        return list(summarize.anonymize(group_by, rows))

    def test_anonymizer(self):
        """A repeated value keeps its label; a new value gets the next."""
        mask = summarize.Anonymizer('Field')
        for _ in range(2):
            self.assertEqual('Field 1', mask('anything'))
        for _ in range(2):
            self.assertEqual('Field 2', mask('anything else'))

    def test_not_needed(self):
        """Rows pass through unchanged when no masked column is grouped."""
        rows = [('a', 'b', 1)]
        result = self._anonymized(('Field1', 'Field2'), rows)
        self.assertEqual(rows, result)

    def test_organization(self):
        """The 'Organization' column is replaced with numbered labels."""
        rows = [
            ('a', 'b', 2),
            ('c', 'd', 1),
        ]
        wanted = [
            ('Organization 1', 'b', 2),
            ('Organization 2', 'd', 1),
        ]
        result = self._anonymized(['Organization', 'Field2'], rows)
        self.assertEqual(wanted, result)

    def test_name(self):
        """The 'Name' column is replaced with numbered labels."""
        rows = [
            ('a', 'b', 2),
            ('c', 'd', 1),
        ]
        wanted = [
            ('a', 'Name 1', 2),
            ('c', 'Name 2', 1),
        ]
        result = self._anonymized(['Field1', 'Name'], rows)
        self.assertEqual(wanted, result)

    def test_email(self):
        """The 'Email' column is replaced with numbered labels."""
        rows = [
            ('a', 'b', 2),
            ('c', 'd', 1),
        ]
        wanted = [
            ('a', 'Email 1', 2),
            ('c', 'Email 2', 1),
        ]
        result = self._anonymized(['Field1', 'Email'], rows)
        self.assertEqual(wanted, result)

View File

@ -11,6 +11,7 @@
# under the License. # under the License.
import collections import collections
import itertools
import logging import logging
from goal_tools.who_helped import contributions from goal_tools.who_helped import contributions
@ -33,6 +34,41 @@ def _count_distinct(by_names, to_count, data_source):
return {k: len(v) for k, v in counts.items()} return {k: len(v) for k, v in counts.items()}
class Anonymizer:
    """Callable that swaps each distinct value for a numbered label.

    The first value passed in is mapped to ``'<field> 1'``, the next
    previously unseen value to ``'<field> 2'``, and so on. A value that
    has been seen before always maps back to the label it was given the
    first time.
    """

    def __init__(self, field):
        # Numbering starts at 1 so labels read "<field> 1", "<field> 2", ...
        self.counter = itertools.count(1)
        # Maps each original value to the label already handed out for it.
        self.cache = {}
        self.field = field

    def __repr__(self):
        template = 'Anonymizer({!r})'
        return template.format(self.field)

    def __call__(self, value):
        """Return the label for *value*, allocating one on first sight."""
        try:
            return self.cache[value]
        except KeyError:
            pass
        # Only advance the counter for values that were not cached.
        label = '{} {}'.format(self.field, next(self.counter))
        self.cache[value] = label
        return label
def anonymize(group_by, data, fields=('Organization', 'Name', 'Email')):
    """Turn the fields with identifying information into anonymous strings.

    Every column of ``group_by`` whose name appears in ``fields`` has its
    values replaced by the label an ``Anonymizer`` for that column
    produces (``'<column> <n>'``). All other columns, and the value that
    follows the ``group_by`` columns in each row, are passed through
    untouched.

    :param group_by: ordered column names describing the leading values
        of each row.
    :param data: iterable of rows; each row holds one value per
        ``group_by`` column followed by one more value (the count).
    :param fields: iterable of column names to mask. Defaults to the
        organization and personal columns; pass a different collection
        to mask other columns, or an empty one to mask nothing.
    :returns: a generator yielding one new tuple per input row.
    """
    # One Anonymizer per maskable column so numbering is independent per
    # column but shared if the same column is listed twice in group_by.
    maskers = {name: Anonymizer(name) for name in fields}

    def passthrough(value):
        return value

    modifiers = [maskers.get(column, passthrough) for column in group_by]
    modifiers.append(passthrough)  # for the count field
    for row in data:
        # zip() pairs each value with the modifier for its position.
        yield tuple(modify(value) for modify, value in zip(modifiers, row))
class SummarizeContributions(report.ContributionsReportBase): class SummarizeContributions(report.ContributionsReportBase):
"Summarize a contribution report." "Summarize a contribution report."
@ -54,6 +90,13 @@ class SummarizeContributions(report.ContributionsReportBase):
help=('combination of unique values to count ' help=('combination of unique values to count '
'(may be repeated), defaults to counting each contribution'), '(may be repeated), defaults to counting each contribution'),
) )
parser.add_argument(
'--anonymize', '--anon',
dest='anonymize',
default=False,
action='store_true',
help='mask organization and personal identifying information',
)
return parser return parser
def take_action(self, parsed_args): def take_action(self, parsed_args):
@ -74,6 +117,9 @@ class SummarizeContributions(report.ContributionsReportBase):
key=lambda x: (x[-1], x[:-1]), # by count first key=lambda x: (x[-1], x[:-1]), # by count first
)) ))
if parsed_args.anonymize:
output_rows = anonymize(group_by, output_rows)
columns = tuple(group_by) + (to_count_column,) columns = tuple(group_by) + (to_count_column,)
return (columns, output_rows) return (columns, output_rows)