perf(Request): Optimize percent_unescape

Do less work by checking for the existence of '+' before creating a new string, and by decoding from UTF-8 only when there is a chance that the source string contains multibyte sequences.
2014-01-03 11:52:45 -06:00
parent b04a3a095b
commit 42930573b9
4 changed files with 97 additions and 29 deletions
--- a/falcon/request.py
+++ b/falcon/request.py
@@ -35,8 +35,10 @@ import six

 from falcon.exceptions import HTTPBadRequest
 from falcon import util
+from falcon.util import uri
 from falcon import request_helpers as helpers

+
 DEFAULT_ERROR_LOG_FORMAT = (u'{0:%Y-%m-%d %H:%M:%S} [FALCON] [ERROR]'
                            u' {1} {2}{3} => ')

@@ -108,7 +110,18 @@ class Request(object):
        # QUERY_STRING isn't required to be in env, so let's check
        # PERF: if...in is faster than using env.get(...)
        if 'QUERY_STRING' in env and env['QUERY_STRING']:
-            self.query_string = util.percent_unescape(env['QUERY_STRING'])
+
+            # TODO(kgriffs): Should this decode individual values instead
+            # of the entire string? The way it is now, this:
+            #
+            #   x=ab%2Bcd%3D42%2C9
+            #
+            # becomes this:
+            #
+            #   x=ab+cd=42,9
+            #
+            self.query_string = uri.decode(env['QUERY_STRING'])
+
        else:
            self.query_string = six.text_type()

--- a/falcon/util/misc.py
+++ b/falcon/util/misc.py
@@ -18,15 +18,9 @@ limitations under the License.

 import datetime
 import functools
-import six
 import inspect
 import warnings

-if six.PY3:  # pragma nocover
-    import urllib.parse as urllib  # pylint: disable=E0611
-else:  # pragma nocover
-    import urllib
-
 from falcon.util import uri


@@ -149,18 +143,5 @@ def to_query_str(params):
 # TODO(kgriffs): Remove this alias in Falcon v0.2.0
 percent_escape = uri.encode

-
-# TODO(kgriffs): Move this to falcon.uri.decode in Falcon v0.2.0
-def percent_unescape(nstr):
-    """Percent-unescape an input native string into a url.
-
-    Args:
-        nstr: A URL in native string (\u0000 - \u00FF).
-
-    Returns:
-        A URL as a python string, decoded as UTF-8.
-    """
-
-    s = urllib.unquote_plus(nstr)
-
-    return s if six.PY3 else s.decode('utf-8', 'replace')
+# TODO(kgriffs): Remove this alias in Falcon v0.2.0
+percent_unescape = uri.decode
--- a/falcon/util/uri.py
+++ b/falcon/util/uri.py
@@ -18,6 +18,11 @@ limitations under the License.

 import six

+if six.PY3:  # pragma nocover
+    import urllib.parse as urllib  # pylint: disable=E0611
+else:  # pragma nocover
+    import urllib
+

 # NOTE(kgriffs): See also RFC 3986
 _UNRESERVED = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
@@ -134,3 +139,57 @@ Returns:
    have been percent-encoded.

 """
+
+
+def decode(uri):
+    """Decode any percent-encoded characters in a URI or query string.
+
+    Args:
+        uri: An encoded URI (full or partial). If of type str on Python 2,
+            UTF-8 is assumed.
+
+    Returns:
+        A decoded URL. Will be of type `unicode` on Python 2 IFF `uri`
+        contains percent-encoded chars (in which case there is a chance
+        they might contain multibyte Unicode sequences).
+
+    """
+
+    encoded_uri = uri
+
+    #
+    # TODO(kgriffs): urllib is broken when it comes to decoding
+    # non-ASCII strings on Python 2. The problem is, if you pass
+    # it a str type, it doesn't even try to decode the character
+    # set. On the other hand, if you pass it a unicode type, urllib
+    # simply decodes code points as latin1 (not exactly a sensible
+    # default, eh?).
+    #
+    # So, we could just let urllib do its thing and after the fact
+    # decode the result like so:
+    #
+    # if six.PY2 and isinstance(encoded_uri, str):  # pragma nocover
+    #     encoded_uri = encoded_uri.decode('utf-8', 'replace')
+    #
+    # However, that adds several microseconds and will rarely be
+    # needed by the caller who is probably just decoding a query
+    # string, and it is not common to put non-ASCII characters in
+    # a cloud API's query string (please contact me if I am wrong!).
+    #
+
+    # PERF(kgriffs): unquote_plus can do this, but if the string
+    # contains plus signs and no '%', we can skip the unquote call
+    # entirely and save a lot of time!
+    if '+' in encoded_uri:
+        encoded_uri = encoded_uri.replace('+', ' ')
+
+    if '%' in encoded_uri:
+        encoded_uri = urllib.unquote(encoded_uri)
+
+        # PERF(kgriffs): Only spend the time to do this if there
+        # is a chance there were multibyte, UTF-8 encoded
+        # sequences that were percent-encoded.
+        if six.PY2 and isinstance(encoded_uri, str):  # pragma nocover
+            encoded_uri = encoded_uri.decode('utf-8', 'replace')
+
+    return encoded_uri
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -73,19 +73,34 @@ class TestFalconUtils(testtools.TestCase):

        self.assertEqual(expected, garbage_out)

-    def test_percent_escape(self):
+    def test_uri_encode(self):
        url = 'http://example.com/v1/fizbit/messages?limit=3&echo=true'
-        self.assertEqual(falcon.percent_escape(url), url)
+        self.assertEqual(uri.encode(url), url)

-        url2a = u'http://example.com/v1/fizbit/messages?limit=3&e\u00e7ho=true'
-        url2b = 'http://example.com/v1/fizbit/messages?limit=3&e%C3%A7ho=true'
-        self.assertEqual(falcon.percent_escape(url2a), url2b)
+        url = 'http://example.com/v1/fiz bit/messages'
+        expected = 'http://example.com/v1/fiz%20bit/messages'
+        self.assertEqual(uri.encode(url), expected)

-    def test_decode_value(self):
+        url = u'http://example.com/v1/fizbit/messages?limit=3&e\u00e7ho=true'
+        expected = ('http://example.com/v1/fizbit/messages'
+                    '?limit=3&e%C3%A7ho=true')
+        self.assertEqual(uri.encode(url), expected)
+
+    def test_uri_encode_value(self):
        self.assertEqual(uri.encode_value('abcd'), 'abcd')
        self.assertEqual(uri.encode_value(u'abcd'), u'abcd')
        self.assertEqual(uri.encode_value(u'ab cd'), u'ab%20cd')
-        self.assertEqual(uri.encode_value(u'\u00e7'), u'%C3%A7')
+        self.assertEqual(uri.encode_value(u'\u00e7'), '%C3%A7')
        self.assertEqual(uri.encode_value('ab/cd'), 'ab%2Fcd')
        self.assertEqual(uri.encode_value('ab+cd=42,9'),
                         'ab%2Bcd%3D42%2C9')
+
+    def test_uri_decode(self):
+        self.assertEqual(uri.decode('abcd'), 'abcd')
+        self.assertEqual(uri.decode(u'abcd'), u'abcd')
+        self.assertEqual(uri.decode(u'ab%20cd'), u'ab cd')
+        self.assertEqual(uri.decode('%C3%A7'), u'\u00e7')
+        self.assertEqual(uri.decode('ab%2Fcd'), 'ab/cd')
+
+        self.assertEqual(uri.decode('http://example.com?x=ab%2Bcd%3D42%2C9'),
+                         'http://example.com?x=ab+cd=42,9')