perf: Custom uri.decode function

This patch replaces urllib.unquote_plus with code that is tuned for
performance, and is more robust when it comes to decoding non-ASCII
characters.
This commit is contained in:
kgriffs
2014-01-13 23:45:19 -06:00
parent ec3b728d09
commit ff4c73c314
5 changed files with 132 additions and 60 deletions

View File

@@ -1,3 +1,3 @@
"""Falcon benchmarks"""
from bench import main # NOQA
from falcon.bench.bench import main # NOQA

View File

@@ -105,7 +105,7 @@ def avg(array):
def hello_env():
request_headers = {'Content-Type': 'application/json'}
return helpers.create_environ('/hello/584/test',
query_string='limit=10&thing=a%20b',
query_string='limit=10&thing=ab',
headers=request_headers)
@@ -127,7 +127,8 @@ def run(frameworks, trials, iterations, stat_memory):
for name in frameworks:
try:
create_bench(name, hello_env())
except ImportError:
except ImportError as ex:
print(ex)
print('Skipping missing library: ' + name)
del frameworks[frameworks.index(name)]
@@ -167,7 +168,7 @@ def main():
parser = argparse.ArgumentParser(description="Falcon benchmark runner")
parser.add_argument('-b', '--benchmark', type=str, action='append',
choices=frameworks, dest='frameworks')
choices=frameworks, dest='frameworks', nargs='+')
parser.add_argument('-i', '--iterations', type=int, default=50000)
parser.add_argument('-t', '--trials', type=int, default=3)
parser.add_argument('-p', '--profile', action='store_true')
@@ -181,6 +182,16 @@ def main():
if args.frameworks:
frameworks = args.frameworks
# Normalize frameworks type
normalized_frameworks = []
for one_or_many in frameworks:
if isinstance(one_or_many, list):
normalized_frameworks.extend(one_or_many)
else:
normalized_frameworks.append(one_or_many)
frameworks = normalized_frameworks
# Profile?
if args.profile:
for name in frameworks:

View File

@@ -1,6 +1,6 @@
from pecan import make_app
import controllers.root
# from .controllers import root
def create():

View File

@@ -16,14 +16,10 @@ limitations under the License.
"""
import re
import six
if six.PY3: # pragma nocover
import urllib.parse as urllib # pylint: disable=E0611
else: # pragma nocover
import urllib
# NOTE(kgriffs): See also RFC 3986
_UNRESERVED = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
'abcdefghijklmnopqrstuvwxyz'
@@ -32,9 +28,12 @@ _UNRESERVED = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
# NOTE(kgriffs): See also RFC 3986
_DELIMITERS = ":/?#[]@!$&'()*+,;="
_ALL_ALLOWED = _UNRESERVED + _DELIMITERS

# Matches a percent sign followed by any two characters; the two
# characters are interpreted as hex digits via the _HEX_TO_BYTE table.
_ESCAPE_SEQUENCE = re.compile(b'%..')

# Hex digits in both cases, so escape lookups accept '%2f' and '%2F'.
_HEX_DIGITS = '0123456789ABCDEFabcdef'

# Highest byte value that is plain ASCII; bytes above this indicate
# non-ASCII (multibyte UTF-8) content that needs decoding.
_UTF8_MAX = 127
def _create_char_encoder(allowed_chars):
@@ -142,62 +141,114 @@ Returns:
"""
# NOTE(kgriffs): This is actually covered, but not in py33; hence the pragma
if six.PY2: # pragma: no cover
def decode(uri):
"""Decode any percent-encoded characters in a URI or query string.
# This map construction is based on urllib
_HEX_TO_BYTE = dict((a + b, (chr(int(a + b, 16)), int(a + b, 16)))
for a in _HEX_DIGITS
for b in _HEX_DIGITS)
uri.decode intends to model the behavior of
urllib.parse.unquote_plus.
def decode(encoded_uri):
"""Decodes percent-encoded characters in a URI or query string.
Args:
uri: An encoded URI (full or partial). If of type str on Python 2,
UTF-8 is assumed.
This function models the behavior of urllib.parse.unquote_plus, but
is faster. It is also more robust, in that it will decode escaped
UTF-8 multibyte sequences.
Returns:
A decoded URL. Will be of type `unicode` on Python 2 IFF `uri`
contains percent-encoded chars (in which case there is a chance
they might contain multibyte Unicode sequences).
Args:
encoded_uri: An encoded URI (full or partial).
"""
Returns:
A decoded URL. Will be of type `unicode` on Python 2 IFF the
URL contained escaped non-ASCII characters, in which case UTF-8
is assumed per RFC 3986.
encoded_uri = uri
"""
#
# TODO(kgriffs): urllib is broken when it comes to decoding
# non-ASCII strings on Python 2. The problem is, if you pass
# it a str type, it doesn't even try to decode the character
# set. On the other hand, if you pass it a unicode type, urllib
# simply decodes code points as latin1 (not exactly a sensible
# default, eh?).
#
# So, we could just let urllib do its thing and after the fact
# decode the result like so:
#
# if six.PY2 and isinstance(encoded_uri, str): # pragma nocover
# encoded_uri = encoded_uri.decode('utf-8', 'replace')
#
# However, that adds several microseconds and will rarely be
# needed by the caller who is probably just decoding a query
# string, and it is not common to put non-ASCII characters in
# a cloud API's query string (please contact me if I am wrong!).
#
decoded_uri = encoded_uri
# PERF(kgriffs): unquote_plus can do this, but if there are
# *only* plusses in the string, no '%', we can save a lot of
# time!
if '+' in encoded_uri:
encoded_uri = encoded_uri.replace('+', ' ')
# PERF(kgriffs): Don't take the time to instantiate a new
# string unless we have to.
if '+' in decoded_uri:
decoded_uri = decoded_uri.replace('+', ' ')
if '%' in encoded_uri:
encoded_uri = urllib.unquote(encoded_uri)
# Short-circuit if we can
if '%' not in decoded_uri:
return decoded_uri
# Convert to bytes because we are about to replace chars and we
# don't want Python to mistakenly interpret any high bits.
if not isinstance(decoded_uri, str):
# NOTE(kgriffs): Clients should never submit a URI that has
# unescaped non-ASCII chars in them, but just in case they
# do, let's encode in a non-lossy format.
decoded_uri = decoded_uri.encode('utf-8')
# PERF(kgriffs): Use a closure instead of a class.
only_ascii = [True]
def unescape(matchobj):
# NOTE(kgriffs): Strip '%' and convert the hex number
char, byte = _HEX_TO_BYTE[matchobj.group(0)[1:]]
only_ascii[0] = only_ascii[0] and (byte <= _UTF8_MAX)
return char
decoded_uri = _ESCAPE_SEQUENCE.sub(unescape, decoded_uri)
# PERF(kgriffs): Only spend the time to do this if there
# were multibyte, UTF-8 encoded sequences that were
# percent-encoded.
if six.PY2 and isinstance(encoded_uri, str): # pragma nocover
for byte in bytearray(encoded_uri):
if byte > 127:
encoded_uri = encoded_uri.decode('utf-8', 'replace')
break
# were non-ascii bytes found in the string.
if not only_ascii[0]:
decoded_uri = decoded_uri.decode('utf-8', 'replace')
return encoded_uri
return decoded_uri
# NOTE(kgriffs): This is actually covered, but not in py2x; hence the pragma
else: # pragma: no cover
# Map each two-hex-digit escape code (as bytes, e.g. b'2F') to the
# single byte it encodes. Both upper- and lowercase digit pairs are
# covered because _HEX_DIGITS contains both cases. This construction
# is based on the one used by urllib.
_HEX_TO_BYTE = {
    (hi + lo).encode(): bytes([int(hi + lo, 16)])
    for hi in _HEX_DIGITS
    for lo in _HEX_DIGITS
}
def _unescape(matchobj):
    """Substitution callback: convert one %XX escape match to its byte.

    Args:
        matchobj: A match of _ESCAPE_SEQUENCE (b'%' plus two chars).

    Returns:
        The single byte encoded by the escape, or the original matched
        bytes when the two characters are not valid hex digits.
    """
    # NOTE(kgriffs): Strip '%' and convert the hex number.
    try:
        return _HEX_TO_BYTE[matchobj.group(0)[1:]]
    except KeyError:
        # Malformed escape (e.g. b'%zz'): pass it through unchanged,
        # mirroring urllib.parse.unquote_plus instead of raising.
        return matchobj.group(0)
def decode(encoded_uri):
    """Decodes percent-encoded characters in a URI or query string.

    This function models the behavior of urllib.parse.unquote_plus,
    albeit in a faster, more straightforward manner.

    Args:
        encoded_uri: An encoded URI (full or partial).

    Returns:
        A decoded URL. If the URL contains escaped non-ASCII
        characters, UTF-8 is assumed per RFC 3986.
    """
    # A '+' denotes a space in query strings; only pay for building a
    # new string when one is actually present.
    partially_decoded = (
        encoded_uri.replace('+', ' ') if '+' in encoded_uri else encoded_uri
    )

    # Guard clause: no percent-escapes means nothing left to do.
    if '%' not in partially_decoded:
        return partially_decoded

    # NOTE(kgriffs): Clients should never submit a URI that has
    # unescaped non-ASCII chars in them, but just in case they do,
    # encode into a non-lossy format before byte-level substitution.
    raw = partially_decoded.encode('utf-8')

    # Expand every %XX escape sequence to the byte it represents.
    raw = _ESCAPE_SEQUENCE.sub(_unescape, raw)

    # Back to str, assuming UTF-8 per RFC 3986 for multibyte sequences.
    return raw.decode('utf-8', 'replace')

View File

@@ -110,6 +110,8 @@ class TestFalconUtils(testtools.TestCase):
self.assertEqual(uri.encode_value(u'abcd'), u'abcd')
self.assertEqual(uri.encode_value(u'ab cd'), u'ab%20cd')
self.assertEqual(uri.encode_value(u'\u00e7'), '%C3%A7')
self.assertEqual(uri.encode_value(u'\u00e7\u20ac'),
'%C3%A7%E2%82%AC')
self.assertEqual(uri.encode_value('ab/cd'), 'ab%2Fcd')
self.assertEqual(uri.encode_value('ab+cd=42,9'),
'ab%2Bcd%3D42%2C9')
@@ -118,7 +120,13 @@ class TestFalconUtils(testtools.TestCase):
self.assertEqual(uri.decode('abcd'), 'abcd')
self.assertEqual(uri.decode(u'abcd'), u'abcd')
self.assertEqual(uri.decode(u'ab%20cd'), u'ab cd')
self.assertEqual(uri.decode('%C3%A7'), u'\u00e7')
self.assertEqual(uri.decode('This thing is %C3%A7'),
u'This thing is \u00e7')
self.assertEqual(uri.decode('This thing is %C3%A7%E2%82%AC'),
u'This thing is \u00e7\u20ac')
self.assertEqual(uri.decode('ab%2Fcd'), 'ab/cd')
self.assertEqual(uri.decode('http://example.com?x=ab%2Bcd%3D42%2C9'),
@@ -145,6 +153,8 @@ class TestFalconUtils(testtools.TestCase):
def test_prop_uri_decode_models_stdlib_unquote_plus(self):
stdlib_unquote = six.moves.urllib.parse.unquote_plus
for case in self.uris:
case = uri.encode_value(case)
expect = stdlib_unquote(case)
actual = uri.decode(case)
self.assertEqual(expect, actual)