diff --git a/falcon/bench/__init__.py b/falcon/bench/__init__.py
index 5685f72..ca7d7f2 100644
--- a/falcon/bench/__init__.py
+++ b/falcon/bench/__init__.py
@@ -1,3 +1,3 @@
 """Falcon benchmarks"""
 
-from bench import main  # NOQA
+from falcon.bench.bench import main  # NOQA
diff --git a/falcon/bench/bench.py b/falcon/bench/bench.py
index d3dc74e..72e490c 100755
--- a/falcon/bench/bench.py
+++ b/falcon/bench/bench.py
@@ -105,7 +105,7 @@ def avg(array):
 def hello_env():
     request_headers = {'Content-Type': 'application/json'}
     return helpers.create_environ('/hello/584/test',
-                                  query_string='limit=10&thing=a%20b',
+                                  query_string='limit=10&thing=ab',
                                   headers=request_headers)
 
 
@@ -127,7 +127,8 @@ def run(frameworks, trials, iterations, stat_memory):
     for name in frameworks:
         try:
             create_bench(name, hello_env())
-        except ImportError:
+        except ImportError as ex:
+            print(ex)
             print('Skipping missing library: ' + name)
             del frameworks[frameworks.index(name)]
 
@@ -167,7 +168,7 @@ def main():
     parser = argparse.ArgumentParser(description="Falcon benchmark runner")
     parser.add_argument('-b', '--benchmark', type=str, action='append',
-                        choices=frameworks, dest='frameworks')
+                        choices=frameworks, dest='frameworks', nargs='+')
     parser.add_argument('-i', '--iterations', type=int, default=50000)
     parser.add_argument('-t', '--trials', type=int, default=3)
     parser.add_argument('-p', '--profile', action='store_true')
@@ -181,6 +182,16 @@ def main():
     if args.frameworks:
         frameworks = args.frameworks
 
+    # Normalize frameworks type
+    normalized_frameworks = []
+    for one_or_many in frameworks:
+        if isinstance(one_or_many, list):
+            normalized_frameworks.extend(one_or_many)
+        else:
+            normalized_frameworks.append(one_or_many)
+
+    frameworks = normalized_frameworks
+
     # Profile?
     if args.profile:
         for name in frameworks:
diff --git a/falcon/bench/nuts/nuts/app.py b/falcon/bench/nuts/nuts/app.py
index 0f3eeff..31f6841 100644
--- a/falcon/bench/nuts/nuts/app.py
+++ b/falcon/bench/nuts/nuts/app.py
@@ -1,6 +1,6 @@
 from pecan import make_app
 
-import controllers.root
+# from .controllers import root
 
 
 def create():
diff --git a/falcon/util/uri.py b/falcon/util/uri.py
index da2641b..9b64c7e 100644
--- a/falcon/util/uri.py
+++ b/falcon/util/uri.py
@@ -16,14 +16,10 @@ limitations under the License.
 
 """
 
+import re
+
 import six
 
-if six.PY3:  # pragma nocover
-    import urllib.parse as urllib  # pylint: disable=E0611
-else:  # pragma nocover
-    import urllib
-
-
 # NOTE(kgriffs): See also RFC 3986
 _UNRESERVED = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                'abcdefghijklmnopqrstuvwxyz'
@@ -32,9 +28,12 @@ _UNRESERVED = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
 
 # NOTE(kgriffs): See also RFC 3986
 _DELIMITERS = ":/?#[]@!$&'()*+,;="
-
 _ALL_ALLOWED = _UNRESERVED + _DELIMITERS
 
+_ESCAPE_SEQUENCE = re.compile(b'%..')
+_HEX_DIGITS = '0123456789ABCDEFabcdef'
+_UTF8_MAX = 127
+
 
 def _create_char_encoder(allowed_chars):
@@ -142,62 +141,114 @@ Returns:
 
 """
 
+# NOTE(kgriffs): This is actually covered, but not in py33; hence the pragma
+if six.PY2:  # pragma: no cover
 
-def decode(uri):
-    """Decode any percent-encoded characters in a URI or query string.
+    # This map construction is based on urllib
+    _HEX_TO_BYTE = dict((a + b, (chr(int(a + b, 16)), int(a + b, 16)))
+                        for a in _HEX_DIGITS
+                        for b in _HEX_DIGITS)
 
-    uri.decode intends to model the behavior of
-    urllib.parse.unquote_plus.
+    def decode(encoded_uri):
+        """Decodes percent-encoded characters in a URI or query string.
 
-    Args:
-        uri: An encoded URI (full or partial). If of type str on Python 2,
-            UTF-8 is assumed.
+        This function models the behavior of urllib.parse.unquote_plus, but
+        is faster. It is also more robust, in that it will decode escaped
+        UTF-8 multibyte sequences.
 
-    Returns:
-        A decoded URL. Will be of type `unicode` on Python 2 IFF `uri`
-        contains percent-encoded chars (in which case there is a chance
-        they might contain multibyte Unicode sequences).
+        Args:
+            encoded_uri: An encoded URI (full or partial).
 
-    """
+        Returns:
+            A decoded URL. Will be of type `unicode` on Python 2 IFF the
+            URL contained escaped non-ASCII characters, in which case UTF-8
+            is assumed per RFC 3986.
 
-    encoded_uri = uri
+        """
 
-    #
-    # TODO(kgriffs): urllib is broken when it comes to decoding
-    # non-ASCII strings on Python 2. The problem is, if you pass
-    # it a str type, it doesn't even try to decode the character
-    # set. On the other hand, if you pass it a unicode type, urllib
-    # simply decodes code points as latin1 (not exactly a sensible
-    # default, eh?).
-    #
-    # So, we could just let urllib do its thing and after the fact
-    # decode the result like so:
-    #
-    #   if six.PY2 and isinstance(encoded_uri, str):  # pragma nocover
-    #       encoded_uri = encoded_uri.decode('utf-8', 'replace')
-    #
-    # However, that adds several microseconds and will rarely be
-    # needed by the caller who is probably just decoding a query
-    # string, and it is not common to put non-ASCII characters in
-    # a cloud API's query string (please contact me if I am wrong!).
-    #
+        decoded_uri = encoded_uri
 
-    # PERF(kgriffs): unquote_plus can do this, but if there are
-    # *only* plusses in the string, no '%', we can save a lot of
-    # time!
-    if '+' in encoded_uri:
-        encoded_uri = encoded_uri.replace('+', ' ')
+        # PERF(kgriffs): Don't take the time to instantiate a new
+        # string unless we have to.
+        if '+' in decoded_uri:
+            decoded_uri = decoded_uri.replace('+', ' ')
 
-    if '%' in encoded_uri:
-        encoded_uri = urllib.unquote(encoded_uri)
+        # Short-circuit if we can
+        if '%' not in decoded_uri:
+            return decoded_uri
+
+        # Convert to bytes because we are about to replace chars and we
+        # don't want Python to mistakenly interpret any high bits.
+        if not isinstance(decoded_uri, str):
+            # NOTE(kgriffs): Clients should never submit a URI that has
+            # unescaped non-ASCII chars in them, but just in case they
+            # do, let's encode in a non-lossy format.
+            decoded_uri = decoded_uri.encode('utf-8')
+
+        # PERF(kgriffs): Use a closure instead of a class.
+        only_ascii = [True]
+
+        def unescape(matchobj):
+            # NOTE(kgriffs): Strip '%' and convert the hex number
+            char, byte = _HEX_TO_BYTE[matchobj.group(0)[1:]]
+            only_ascii[0] = only_ascii[0] and (byte <= _UTF8_MAX)
+
+            return char
+
+        decoded_uri = _ESCAPE_SEQUENCE.sub(unescape, decoded_uri)
 
         # PERF(kgriffs): Only spend the time to do this if there
-        # were multibyte, UTF-8 encoded sequences that were
-        # percent-encoded.
-        if six.PY2 and isinstance(encoded_uri, str):  # pragma nocover
-            for byte in bytearray(encoded_uri):
-                if byte > 127:
-                    encoded_uri = encoded_uri.decode('utf-8', 'replace')
-                    break
+        # were non-ascii bytes found in the string.
+        if not only_ascii[0]:
+            decoded_uri = decoded_uri.decode('utf-8', 'replace')
 
-    return encoded_uri
+        return decoded_uri
+
+# NOTE(kgriffs): This is actually covered, but not in py2x; hence the pragma
+else:  # pragma: no cover
+
+    # This map construction is based on urllib
+    _HEX_TO_BYTE = dict(((a + b).encode(), bytes([int(a + b, 16)]))
+                        for a in _HEX_DIGITS
+                        for b in _HEX_DIGITS)
+
+    def _unescape(matchobj):
+        # NOTE(kgriffs): Strip '%' and convert the hex number
+        return _HEX_TO_BYTE[matchobj.group(0)[1:]]
+
+    def decode(encoded_uri):
+        """Decodes percent-encoded characters in a URI or query string.
+
+        This function models the behavior of urllib.parse.unquote_plus,
+        albeit in a faster, more straightforward manner.
+
+        Args:
+            encoded_uri: An encoded URI (full or partial).
+
+        Returns:
+            A decoded URL. If the URL contains escaped non-ASCII
+            characters, UTF-8 is assumed per RFC 3986.
+
+        """
+
+        decoded_uri = encoded_uri
+
+        # PERF(kgriffs): Don't take the time to instantiate a new
+        # string unless we have to.
+        if '+' in decoded_uri:
+            decoded_uri = decoded_uri.replace('+', ' ')
+
+        # Short-circuit if we can
+        if '%' not in decoded_uri:
+            return decoded_uri
+
+        # NOTE(kgriffs): Clients should never submit a URI that has
+        # unescaped non-ASCII chars in them, but just in case they
+        # do, let's encode into a non-lossy format.
+        decoded_uri = decoded_uri.encode('utf-8')
+
+        # Replace escape sequences
+        decoded_uri = _ESCAPE_SEQUENCE.sub(_unescape, decoded_uri)
+
+        # Back to str
+        return decoded_uri.decode('utf-8', 'replace')
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 39bcd52..fc7ea7e 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -110,6 +110,8 @@ class TestFalconUtils(testtools.TestCase):
         self.assertEqual(uri.encode_value(u'abcd'), u'abcd')
         self.assertEqual(uri.encode_value(u'ab cd'), u'ab%20cd')
         self.assertEqual(uri.encode_value(u'\u00e7'), '%C3%A7')
+        self.assertEqual(uri.encode_value(u'\u00e7\u20ac'),
+                         '%C3%A7%E2%82%AC')
         self.assertEqual(uri.encode_value('ab/cd'), 'ab%2Fcd')
         self.assertEqual(uri.encode_value('ab+cd=42,9'),
                          'ab%2Bcd%3D42%2C9')
@@ -118,7 +120,13 @@ class TestFalconUtils(testtools.TestCase):
         self.assertEqual(uri.decode('abcd'), 'abcd')
         self.assertEqual(uri.decode(u'abcd'), u'abcd')
         self.assertEqual(uri.decode(u'ab%20cd'), u'ab cd')
-        self.assertEqual(uri.decode('%C3%A7'), u'\u00e7')
+
+        self.assertEqual(uri.decode('This thing is %C3%A7'),
+                         u'This thing is \u00e7')
+
+        self.assertEqual(uri.decode('This thing is %C3%A7%E2%82%AC'),
+                         u'This thing is \u00e7\u20ac')
+
         self.assertEqual(uri.decode('ab%2Fcd'), 'ab/cd')
 
         self.assertEqual(uri.decode('http://example.com?x=ab%2Bcd%3D42%2C9'),
@@ -145,6 +153,8 @@ class TestFalconUtils(testtools.TestCase):
     def test_prop_uri_decode_models_stdlib_unquote_plus(self):
        stdlib_unquote = six.moves.urllib.parse.unquote_plus
         for case in self.uris:
+            case = uri.encode_value(case)
+
             expect = stdlib_unquote(case)
             actual = uri.decode(case)
             self.assertEqual(expect, actual)
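
Note (not part of the patch): a minimal usage sketch of the reworked decoder, assuming the patch is applied and falcon is importable; the expected values mirror the test cases above.

    from falcon.util import uri

    # '+' is still translated to a space, as with urllib.parse.unquote_plus
    assert uri.decode('ab+cd') == 'ab cd'

    # Percent-escaped UTF-8 multibyte sequences are decoded per RFC 3986
    assert uri.decode('This thing is %C3%A7') == u'This thing is \u00e7'
    assert uri.decode('This thing is %C3%A7%E2%82%AC') == u'This thing is \u00e7\u20ac'

Per the new docstrings, on Python 2 the result is `unicode` only when the input contained escaped non-ASCII bytes; on Python 3 it is always `str`.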