Improve performance of strutils.mask_password

Only apply substitution patterns related to key values that appear in
the string.

Replace .*? in patterns with more explicit patterns that don't require
backtracking.

Add a performance test script for future testing work, with references
to large data files that can be downloaded but that we don't want to
check into the source repository because of their sizes.

Change-Id: Ic3ed252d181c93b8a0db465db6c8c4a7ca97da42
Related-bug: #1408362
This commit is contained in:
Doug Hellmann 2015-01-07 14:12:26 -05:00
parent ca76fdcb52
commit 6e0b86164b
2 changed files with 79 additions and 21 deletions

View File

@ -58,30 +58,37 @@ _SANITIZE_KEYS = ['adminPass', 'admin_pass', 'password', 'admin_password',
# _SANITIZE_KEYS we already have. This way, we only have to add the new key
# to the list of _SANITIZE_KEYS and we can generate regular expressions
# for XML and JSON automatically.
_SANITIZE_PATTERNS_2 = []
_SANITIZE_PATTERNS_1 = []
_SANITIZE_PATTERNS_2 = {}
_SANITIZE_PATTERNS_1 = {}
# NOTE(amrith): Some regular expressions have only one parameter, some
# have two parameters. Use different lists of patterns here.
_FORMAT_PATTERNS_1 = [r'(%(key)s\s*[=]\s*)[^\s^\'^\"]+']
_FORMAT_PATTERNS_2 = [r'(%(key)s\s*[=]\s*[\"\']).*?([\"\'])',
r'(%(key)s\s+[\"\']).*?([\"\'])',
_FORMAT_PATTERNS_2 = [r'(%(key)s\s*[=]\s*[\"\'])[^\"\']*([\"\'])',
r'(%(key)s\s+[\"\'])[^\"\']*([\"\'])',
r'([-]{2}%(key)s\s+)[^\'^\"^=^\s]+([\s]*)',
r'(<%(key)s>).*?(</%(key)s>)',
r'([\"\']%(key)s[\"\']\s*:\s*[\"\']).*?([\"\'])',
r'([\'"].*?%(key)s[\'"]\s*:\s*u?[\'"]).*?([\'"])',
r'([\'"].*?%(key)s[\'"]\s*,\s*\'--?[A-z]+\'\s*,\s*u?'
'[\'"]).*?([\'"])',
r'(<%(key)s>)[^<]*(</%(key)s>)',
r'([\"\']%(key)s[\"\']\s*:\s*[\"\'])[^\"\']*([\"\'])',
r'([\'"][^"\']*%(key)s[\'"]\s*:\s*u?[\'"])[^\"\']*'
'([\'"])',
r'([\'"][^\'"]*%(key)s[\'"]\s*,\s*\'--?[A-z]+\'\s*,\s*u?'
'[\'"])[^\"\']*([\'"])',
r'(%(key)s\s*--?[A-z]+\s*)\S+(\s*)']
# NOTE(dhellmann): Keep a separate list of patterns by key so we only
# need to apply the substitutions for keys we find using a quick "in"
# test.
for key in _SANITIZE_KEYS:
_SANITIZE_PATTERNS_1[key] = []
_SANITIZE_PATTERNS_2[key] = []
for pattern in _FORMAT_PATTERNS_2:
reg_ex = re.compile(pattern % {'key': key}, re.DOTALL)
_SANITIZE_PATTERNS_2.append(reg_ex)
_SANITIZE_PATTERNS_2[key].append(reg_ex)
for pattern in _FORMAT_PATTERNS_1:
reg_ex = re.compile(pattern % {'key': key}, re.DOTALL)
_SANITIZE_PATTERNS_1.append(reg_ex)
_SANITIZE_PATTERNS_1[key].append(reg_ex)
def int_from_bool_as_string(subject):
@ -230,19 +237,18 @@ def mask_password(message, secret="***"):
# byte string. A better solution will be provided in Kilo.
pass
substitute1 = r'\g<1>' + secret
substitute2 = r'\g<1>' + secret + r'\g<2>'
# NOTE(ldbragst): Check to see if anything in message contains any key
# specified in _SANITIZE_KEYS, if not then just return the message since
# we don't have to mask any passwords.
if not any(key in message for key in _SANITIZE_KEYS):
return message
substitute = r'\g<1>' + secret + r'\g<2>'
for pattern in _SANITIZE_PATTERNS_2:
message = re.sub(pattern, substitute, message)
substitute = r'\g<1>' + secret
for pattern in _SANITIZE_PATTERNS_1:
message = re.sub(pattern, substitute, message)
for key in _SANITIZE_KEYS:
if key in message:
for pattern in _SANITIZE_PATTERNS_2[key]:
message = re.sub(pattern, substitute2, message)
for pattern in _SANITIZE_PATTERNS_1[key]:
message = re.sub(pattern, substitute1, message)
return message

View File

@ -0,0 +1,52 @@
#!/usr/bin/env python
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""Performance tests for mask_password.
"""
from __future__ import print_function
import timeit
from oslo_utils import strutils
# A moderately sized input (~50K) string
# http://paste.openstack.org/raw/155864/
# infile = '155864.txt'

# Untruncated version of the above (~310K)
# http://dl.sileht.net/public/payload.json.gz
infile = 'large_json_payload.txt'


def main():
    """Time each 'admin_pass' two-group pattern, then mask_password itself.

    The payload is read from ``infile`` and handed to the timed callables
    as a Python object. It is deliberately NOT interpolated into generated
    source code: doing so would re-interpret backslash escapes in the data
    (changing what is measured) and would fail outright if the data ever
    contained a triple quote.
    """
    with open(infile, 'r') as f:
        input_str = f.read()

    print('payload has %d bytes' % len(input_str))

    # Raw string so that \g is a regex group reference rather than an
    # (invalid) string escape sequence.
    substitute = r'\g<1>***\g<2>'

    for pattern in strutils._SANITIZE_PATTERNS_2['admin_pass']:
        print('\ntesting %s' % pattern.pattern)
        # Time the already-compiled pattern object so the flags it was
        # compiled with in strutils (re.DOTALL) are preserved. The default
        # argument binds the current pattern to this lambda.
        t = timeit.Timer(
            lambda p=pattern: p.sub(substitute, input_str))
        print(t.timeit(1))

    # End-to-end timing of the public entry point on the same payload.
    t = timeit.Timer(lambda: strutils.mask_password(input_str))
    print(t.timeit(1))


if __name__ == '__main__':
    main()