perf: Custom uri.decode function
This patch replaces urllib.unquote_plus with code that is tuned for performance, and is more robust when it comes to decoding non-ASCII characters.
This commit is contained in:
@@ -1,3 +1,3 @@
|
||||
"""Falcon benchmarks"""
|
||||
|
||||
from bench import main # NOQA
|
||||
from falcon.bench.bench import main # NOQA
|
||||
|
||||
@@ -105,7 +105,7 @@ def avg(array):
|
||||
def hello_env():
|
||||
request_headers = {'Content-Type': 'application/json'}
|
||||
return helpers.create_environ('/hello/584/test',
|
||||
query_string='limit=10&thing=a%20b',
|
||||
query_string='limit=10&thing=ab',
|
||||
headers=request_headers)
|
||||
|
||||
|
||||
@@ -127,7 +127,8 @@ def run(frameworks, trials, iterations, stat_memory):
|
||||
for name in frameworks:
|
||||
try:
|
||||
create_bench(name, hello_env())
|
||||
except ImportError:
|
||||
except ImportError as ex:
|
||||
print(ex)
|
||||
print('Skipping missing library: ' + name)
|
||||
del frameworks[frameworks.index(name)]
|
||||
|
||||
@@ -167,7 +168,7 @@ def main():
|
||||
|
||||
parser = argparse.ArgumentParser(description="Falcon benchmark runner")
|
||||
parser.add_argument('-b', '--benchmark', type=str, action='append',
|
||||
choices=frameworks, dest='frameworks')
|
||||
choices=frameworks, dest='frameworks', nargs='+')
|
||||
parser.add_argument('-i', '--iterations', type=int, default=50000)
|
||||
parser.add_argument('-t', '--trials', type=int, default=3)
|
||||
parser.add_argument('-p', '--profile', action='store_true')
|
||||
@@ -181,6 +182,16 @@ def main():
|
||||
if args.frameworks:
|
||||
frameworks = args.frameworks
|
||||
|
||||
# Normalize frameworks type
|
||||
normalized_frameworks = []
|
||||
for one_or_many in frameworks:
|
||||
if isinstance(one_or_many, list):
|
||||
normalized_frameworks.extend(one_or_many)
|
||||
else:
|
||||
normalized_frameworks.append(one_or_many)
|
||||
|
||||
frameworks = normalized_frameworks
|
||||
|
||||
# Profile?
|
||||
if args.profile:
|
||||
for name in frameworks:
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from pecan import make_app
|
||||
|
||||
import controllers.root
|
||||
# from .controllers import root
|
||||
|
||||
|
||||
def create():
|
||||
|
||||
@@ -16,14 +16,10 @@ limitations under the License.
|
||||
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
import six
|
||||
|
||||
if six.PY3: # pragma nocover
|
||||
import urllib.parse as urllib # pylint: disable=E0611
|
||||
else: # pragma nocover
|
||||
import urllib
|
||||
|
||||
|
||||
# NOTE(kgriffs): See also RFC 3986
|
||||
_UNRESERVED = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
|
||||
'abcdefghijklmnopqrstuvwxyz'
|
||||
@@ -32,9 +28,12 @@ _UNRESERVED = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
|
||||
|
||||
# NOTE(kgriffs): See also RFC 3986
|
||||
_DELIMITERS = ":/?#[]@!$&'()*+,;="
|
||||
|
||||
_ALL_ALLOWED = _UNRESERVED + _DELIMITERS
|
||||
|
||||
# NOTE: Match only well-formed escapes, i.e. a '%' followed by exactly two
# hex digits. Every such pair is a key of the hex lookup table built from
# _HEX_DIGITS, so the substitution callbacks can never hit a missing key.
# Malformed input such as '%zz' or a trailing '%4' is left untouched (the
# same thing urllib's unquote does) instead of raising KeyError, which is
# what the previous catch-all pattern '%..' allowed to happen.
_ESCAPE_SEQUENCE = re.compile(b'%[0-9A-Fa-f]{2}')
|
||||
_HEX_DIGITS = '0123456789ABCDEFabcdef'
|
||||
_UTF8_MAX = 127
|
||||
|
||||
|
||||
def _create_char_encoder(allowed_chars):
|
||||
|
||||
@@ -142,62 +141,114 @@ Returns:
|
||||
|
||||
"""
|
||||
|
||||
# NOTE(kgriffs): This is actually covered, but not in py33; hence the pragma
|
||||
if six.PY2: # pragma: no cover
|
||||
|
||||
def decode(uri):
|
||||
"""Decode any percent-encoded characters in a URI or query string.
|
||||
# This map construction is based on urllib
|
||||
_HEX_TO_BYTE = dict((a + b, (chr(int(a + b, 16)), int(a + b, 16)))
|
||||
for a in _HEX_DIGITS
|
||||
for b in _HEX_DIGITS)
|
||||
|
||||
uri.decode intends to model the behavior of
|
||||
urllib.parse.unquote_plus.
|
||||
def decode(encoded_uri):
|
||||
"""Decodes percent-encoded characters in a URI or query string.
|
||||
|
||||
Args:
|
||||
uri: An encoded URI (full or partial). If of type str on Python 2,
|
||||
UTF-8 is assumed.
|
||||
This function models the behavior of urllib.parse.unquote_plus, but
|
||||
is faster. It is also more robust, in that it will decode escaped
|
||||
UTF-8 multibyte sequences.
|
||||
|
||||
Returns:
|
||||
A decoded URL. Will be of type `unicode` on Python 2 IFF `uri`
|
||||
contains percent-encoded chars (in which case there is a chance
|
||||
they might contain multibyte Unicode sequences).
|
||||
Args:
|
||||
encoded_uri: An encoded URI (full or partial).
|
||||
|
||||
"""
|
||||
Returns:
|
||||
A decoded URL. Will be of type `unicode` on Python 2 IFF the
|
||||
URL contained escaped non-ASCII characters, in which case UTF-8
|
||||
is assumed per RFC 3986.
|
||||
|
||||
encoded_uri = uri
|
||||
"""
|
||||
|
||||
#
|
||||
# TODO(kgriffs): urllib is broken when it comes to decoding
|
||||
# non-ASCII strings on Python 2. The problem is, if you pass
|
||||
# it a str type, it doesn't even try to decode the character
|
||||
# set. On the other hand, if you pass it a unicode type, urllib
|
||||
# simply decodes code points as latin1 (not exactly a sensible
|
||||
# default, eh?).
|
||||
#
|
||||
# So, we could just let urllib do its thing and after the fact
|
||||
# decode the result like so:
|
||||
#
|
||||
# if six.PY2 and isinstance(encoded_uri, str): # pragma nocover
|
||||
# encoded_uri = encoded_uri.decode('utf-8', 'replace')
|
||||
#
|
||||
# However, that adds several microseconds and will rarely be
|
||||
# needed by the caller who is probably just decoding a query
|
||||
# string, and it is not common to put non-ASCII characters in
|
||||
# a cloud API's query string (please contact me if I am wrong!).
|
||||
#
|
||||
decoded_uri = encoded_uri
|
||||
|
||||
# PERF(kgriffs): unquote_plus can do this, but if there are
|
||||
# *only* plusses in the string, no '%', we can save a lot of
|
||||
# time!
|
||||
if '+' in encoded_uri:
|
||||
encoded_uri = encoded_uri.replace('+', ' ')
|
||||
# PERF(kgriffs): Don't take the time to instantiate a new
|
||||
# string unless we have to.
|
||||
if '+' in decoded_uri:
|
||||
decoded_uri = decoded_uri.replace('+', ' ')
|
||||
|
||||
if '%' in encoded_uri:
|
||||
encoded_uri = urllib.unquote(encoded_uri)
|
||||
# Short-circuit if we can
|
||||
if '%' not in decoded_uri:
|
||||
return decoded_uri
|
||||
|
||||
# Convert to bytes because we are about to replace chars and we
|
||||
# don't want Python to mistakenly interpret any high bits.
|
||||
if not isinstance(decoded_uri, str):
|
||||
# NOTE(kgriffs): Clients should never submit a URI that has
|
||||
# unescaped non-ASCII chars in them, but just in case they
|
||||
# do, let's encode in a non-lossy format.
|
||||
decoded_uri = decoded_uri.encode('utf-8')
|
||||
|
||||
# PERF(kgriffs): Use a closure instead of a class.
|
||||
only_ascii = [True]
|
||||
|
||||
def unescape(matchobj):
|
||||
# NOTE(kgriffs): Strip '%' and convert the hex number
|
||||
char, byte = _HEX_TO_BYTE[matchobj.group(0)[1:]]
|
||||
only_ascii[0] = only_ascii[0] and (byte <= _UTF8_MAX)
|
||||
|
||||
return char
|
||||
|
||||
decoded_uri = _ESCAPE_SEQUENCE.sub(unescape, decoded_uri)
|
||||
|
||||
# PERF(kgriffs): Only spend the time to do this if there
|
||||
# were multibyte, UTF-8 encoded sequences that were
|
||||
# percent-encoded.
|
||||
if six.PY2 and isinstance(encoded_uri, str): # pragma nocover
|
||||
for byte in bytearray(encoded_uri):
|
||||
if byte > 127:
|
||||
encoded_uri = encoded_uri.decode('utf-8', 'replace')
|
||||
break
|
||||
# were non-ascii bytes found in the string.
|
||||
if not only_ascii[0]:
|
||||
decoded_uri = decoded_uri.decode('utf-8', 'replace')
|
||||
|
||||
return encoded_uri
|
||||
return decoded_uri
|
||||
|
||||
# NOTE(kgriffs): This is actually covered, but not in py2x; hence the pragma
|
||||
else: # pragma: no cover
|
||||
|
||||
# This map construction is based on urllib
|
||||
_HEX_TO_BYTE = dict(((a + b).encode(), bytes([int(a + b, 16)]))
|
||||
for a in _HEX_DIGITS
|
||||
for b in _HEX_DIGITS)
|
||||
|
||||
def _unescape(matchobj):
    """Translate one matched percent-escape into its raw byte.

    Args:
        matchobj: A regex match whose full text (group 0) is the escape
            sequence, i.e. a '%' followed by two more characters.

    Returns:
        The entry stored in _HEX_TO_BYTE for the two characters that
        follow the '%'. The lookup is a plain subscript, so a pair that
        is missing from the table raises KeyError.

    """

    escape = matchobj.group(0)

    # Drop the leading '%'; what remains is the lookup key.
    hex_pair = escape[1:]

    return _HEX_TO_BYTE[hex_pair]
|
||||
|
||||
def decode(encoded_uri):
    """Decode a percent-encoded URI or query string.

    Modeled on urllib.parse.unquote_plus: every '+' becomes a space and
    percent-escapes are replaced by the bytes they stand for, after which
    the result is interpreted as UTF-8.

    Args:
        encoded_uri: An encoded URI (full or partial).

    Returns:
        The decoded string. When the input contains neither '+' nor '%',
        the very same object is handed back. Bytes that do not form valid
        UTF-8 after unescaping are substituted using the 'replace' error
        handler.

    """

    # PERF: Only build a new string when there is a '+' to translate.
    text = encoded_uri.replace('+', ' ') if '+' in encoded_uri else encoded_uri

    if '%' in text:
        # Work on bytes so that escaped multibyte sequences can be
        # reassembled. Encoding as UTF-8 keeps any unescaped non-ASCII
        # characters intact rather than losing them.
        raw = text.encode('utf-8')
        raw = _ESCAPE_SEQUENCE.sub(_unescape, raw)

        # Back to str
        return raw.decode('utf-8', 'replace')

    # Nothing was escaped, so there is nothing further to do.
    return text
|
||||
|
||||
@@ -110,6 +110,8 @@ class TestFalconUtils(testtools.TestCase):
|
||||
self.assertEqual(uri.encode_value(u'abcd'), u'abcd')
|
||||
self.assertEqual(uri.encode_value(u'ab cd'), u'ab%20cd')
|
||||
self.assertEqual(uri.encode_value(u'\u00e7'), '%C3%A7')
|
||||
self.assertEqual(uri.encode_value(u'\u00e7\u20ac'),
|
||||
'%C3%A7%E2%82%AC')
|
||||
self.assertEqual(uri.encode_value('ab/cd'), 'ab%2Fcd')
|
||||
self.assertEqual(uri.encode_value('ab+cd=42,9'),
|
||||
'ab%2Bcd%3D42%2C9')
|
||||
@@ -118,7 +120,13 @@ class TestFalconUtils(testtools.TestCase):
|
||||
self.assertEqual(uri.decode('abcd'), 'abcd')
|
||||
self.assertEqual(uri.decode(u'abcd'), u'abcd')
|
||||
self.assertEqual(uri.decode(u'ab%20cd'), u'ab cd')
|
||||
self.assertEqual(uri.decode('%C3%A7'), u'\u00e7')
|
||||
|
||||
self.assertEqual(uri.decode('This thing is %C3%A7'),
|
||||
u'This thing is \u00e7')
|
||||
|
||||
self.assertEqual(uri.decode('This thing is %C3%A7%E2%82%AC'),
|
||||
u'This thing is \u00e7\u20ac')
|
||||
|
||||
self.assertEqual(uri.decode('ab%2Fcd'), 'ab/cd')
|
||||
|
||||
self.assertEqual(uri.decode('http://example.com?x=ab%2Bcd%3D42%2C9'),
|
||||
@@ -145,6 +153,8 @@ class TestFalconUtils(testtools.TestCase):
|
||||
def test_prop_uri_decode_models_stdlib_unquote_plus(self):
|
||||
stdlib_unquote = six.moves.urllib.parse.unquote_plus
|
||||
for case in self.uris:
|
||||
case = uri.encode_value(case)
|
||||
|
||||
expect = stdlib_unquote(case)
|
||||
actual = uri.decode(case)
|
||||
self.assertEqual(expect, actual)
|
||||
|
||||
Reference in New Issue
Block a user