From 6e0b86164b955a7930cd39dd3c12574f25f1a14e Mon Sep 17 00:00:00 2001 From: Doug Hellmann Date: Wed, 7 Jan 2015 14:12:26 -0500 Subject: [PATCH] Improve performance of strutils.mask_password Only apply substitution patterns related to key values that appear in the string. Replace .*? in patterns with more explicit patterns that don't require backtracking. Add a performance test script for future testing work, with references to large data files that can be downloaded but that we don't want to check into the source repository because of their sizes. Change-Id: Ic3ed252d181c93b8a0db465db6c8c4a7ca97da42 Related-bug: #1408362 --- oslo_utils/strutils.py | 48 ++++++++++++++++------------- tools/perf_test_mask_password.py | 52 ++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 21 deletions(-) create mode 100644 tools/perf_test_mask_password.py diff --git a/oslo_utils/strutils.py b/oslo_utils/strutils.py index 782903d0..2851ced8 100644 --- a/oslo_utils/strutils.py +++ b/oslo_utils/strutils.py @@ -58,30 +58,37 @@ _SANITIZE_KEYS = ['adminPass', 'admin_pass', 'password', 'admin_password', # _SANITIZE_KEYS we already have. This way, we only have to add the new key # to the list of _SANITIZE_KEYS and we can generate regular expressions # for XML and JSON automatically. -_SANITIZE_PATTERNS_2 = [] -_SANITIZE_PATTERNS_1 = [] +_SANITIZE_PATTERNS_2 = {} +_SANITIZE_PATTERNS_1 = {} # NOTE(amrith): Some regular expressions have only one parameter, some # have two parameters. Use different lists of patterns here. 
_FORMAT_PATTERNS_1 = [r'(%(key)s\s*[=]\s*)[^\s^\'^\"]+'] -_FORMAT_PATTERNS_2 = [r'(%(key)s\s*[=]\s*[\"\']).*?([\"\'])', - r'(%(key)s\s+[\"\']).*?([\"\'])', +_FORMAT_PATTERNS_2 = [r'(%(key)s\s*[=]\s*[\"\'])[^\"\']*([\"\'])', + r'(%(key)s\s+[\"\'])[^\"\']*([\"\'])', r'([-]{2}%(key)s\s+)[^\'^\"^=^\s]+([\s]*)', - r'(<%(key)s>).*?()', - r'([\"\']%(key)s[\"\']\s*:\s*[\"\']).*?([\"\'])', - r'([\'"].*?%(key)s[\'"]\s*:\s*u?[\'"]).*?([\'"])', - r'([\'"].*?%(key)s[\'"]\s*,\s*\'--?[A-z]+\'\s*,\s*u?' - '[\'"]).*?([\'"])', + r'(<%(key)s>)[^<]*()', + r'([\"\']%(key)s[\"\']\s*:\s*[\"\'])[^\"\']*([\"\'])', + r'([\'"][^"\']*%(key)s[\'"]\s*:\s*u?[\'"])[^\"\']*' + '([\'"])', + r'([\'"][^\'"]*%(key)s[\'"]\s*,\s*\'--?[A-z]+\'\s*,\s*u?' + '[\'"])[^\"\']*([\'"])', r'(%(key)s\s*--?[A-z]+\s*)\S+(\s*)'] +# NOTE(dhellmann): Keep a separate list of patterns by key so we only +# need to apply the substitutions for keys we find using a quick "in" +# test. for key in _SANITIZE_KEYS: + _SANITIZE_PATTERNS_1[key] = [] + _SANITIZE_PATTERNS_2[key] = [] + for pattern in _FORMAT_PATTERNS_2: reg_ex = re.compile(pattern % {'key': key}, re.DOTALL) - _SANITIZE_PATTERNS_2.append(reg_ex) + _SANITIZE_PATTERNS_2[key].append(reg_ex) for pattern in _FORMAT_PATTERNS_1: reg_ex = re.compile(pattern % {'key': key}, re.DOTALL) - _SANITIZE_PATTERNS_1.append(reg_ex) + _SANITIZE_PATTERNS_1[key].append(reg_ex) def int_from_bool_as_string(subject): @@ -230,19 +237,18 @@ def mask_password(message, secret="***"): # byte string. A better solution will be provided in Kilo. pass + substitute1 = r'\g<1>' + secret + substitute2 = r'\g<1>' + secret + r'\g<2>' + # NOTE(ldbragst): Check to see if anything in message contains any key # specified in _SANITIZE_KEYS, if not then just return the message since # we don't have to mask any passwords. 
- if not any(key in message for key in _SANITIZE_KEYS): - return message - - substitute = r'\g<1>' + secret + r'\g<2>' - for pattern in _SANITIZE_PATTERNS_2: - message = re.sub(pattern, substitute, message) - - substitute = r'\g<1>' + secret - for pattern in _SANITIZE_PATTERNS_1: - message = re.sub(pattern, substitute, message) + for key in _SANITIZE_KEYS: + if key in message: + for pattern in _SANITIZE_PATTERNS_2[key]: + message = re.sub(pattern, substitute2, message) + for pattern in _SANITIZE_PATTERNS_1[key]: + message = re.sub(pattern, substitute1, message) return message diff --git a/tools/perf_test_mask_password.py b/tools/perf_test_mask_password.py new file mode 100644 index 00000000..f69a734d --- /dev/null +++ b/tools/perf_test_mask_password.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +"""Performance tests for mask_password. 
+""" + +from __future__ import print_function + +import timeit + +from oslo_utils import strutils + +# A moderately sized input (~50K) string +# http://paste.openstack.org/raw/155864/ +# infile = '155864.txt' + +# Untruncated version of the above (~310K) +# http://dl.sileht.net/public/payload.json.gz +infile = 'large_json_payload.txt' + +with open(infile, 'r') as f: + input_str = f.read() +print('payload has %d bytes' % len(input_str)) + +for pattern in strutils._SANITIZE_PATTERNS_2['admin_pass']: + print('\ntesting %s' % pattern.pattern) + t = timeit.Timer( + "re.sub(pattern, r'\g<1>***\g<2>', payload)", + """ +import re +payload = '''%s''' +pattern = re.compile(r'''%s''') +""" % (input_str, pattern.pattern)) + print(t.timeit(1)) + +t = timeit.Timer( + "strutils.mask_password('''" + input_str + "''')", + "from oslo_utils import strutils", +) +print(t.timeit(1))