perf(Request): Optimize percent_unescape

Do less work by checking for existence of '+' before creating a new
string, and also only decoding from UTF-8 when there is a chance
the source string has some multibyte sequences in it.
This commit is contained in:
kgriffs
2014-01-03 11:52:45 -06:00
parent b04a3a095b
commit 42930573b9
4 changed files with 97 additions and 29 deletions

View File

@@ -35,8 +35,10 @@ import six
from falcon.exceptions import HTTPBadRequest
from falcon import util
from falcon.util import uri
from falcon import request_helpers as helpers
DEFAULT_ERROR_LOG_FORMAT = (u'{0:%Y-%m-%d %H:%M:%S} [FALCON] [ERROR]'
u' {1} {2}{3} => ')
@@ -108,7 +110,18 @@ class Request(object):
# QUERY_STRING isn't required to be in env, so let's check
# PERF: if...in is faster than using env.get(...)
if 'QUERY_STRING' in env and env['QUERY_STRING']:
self.query_string = util.percent_unescape(env['QUERY_STRING'])
# TODO(kgriffs): Should this decode individual values instead
# of the entire string? The way it is now, this:
#
# x=ab%2Bcd%3D42%2C9
#
# becomes this:
#
# x=ab+cd=42,9
#
self.query_string = uri.decode(env['QUERY_STRING'])
else:
self.query_string = six.text_type()

View File

@@ -18,15 +18,9 @@ limitations under the License.
import datetime
import functools
import six
import inspect
import warnings
if six.PY3: # pragma nocover
import urllib.parse as urllib # pylint: disable=E0611
else: # pragma nocover
import urllib
from falcon.util import uri
@@ -149,18 +143,5 @@ def to_query_str(params):
# TODO(kgriffs): Remove this alias in Falcon v0.2.0
percent_escape = uri.encode
# TODO(kgriffs): Move this to falcon.uri.decode in Falcon v0.2.0
def percent_unescape(nstr):
    """Percent-unescape a native string, yielding a URL.

    Args:
        nstr: A URL as a native string (code points U+0000 through
            U+00FF).

    Returns:
        The unquoted URL as a python string. On Python 2 the unquoted
        value is additionally decoded as UTF-8, substituting a
        replacement marker for any undecodable sequences.

    """
    unquoted = urllib.unquote_plus(nstr)

    if six.PY3:
        return unquoted

    # Python 2: unquote_plus was handed a byte string, so interpret
    # the result as UTF-8 to obtain text.
    return unquoted.decode('utf-8', 'replace')
# TODO(kgriffs): Remove this alias in Falcon v0.2.0
percent_unescape = uri.decode

View File

@@ -18,6 +18,11 @@ limitations under the License.
import six
if six.PY3: # pragma nocover
import urllib.parse as urllib # pylint: disable=E0611
else: # pragma nocover
import urllib
# NOTE(kgriffs): See also RFC 3986
_UNRESERVED = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
@@ -134,3 +139,57 @@ Returns:
have been percent-encoded.
"""
def decode(uri):
    """Decode percent-encoded characters in a URI or query string.

    Every '+' is first translated to a space; percent-encoded
    sequences are then unquoted, but only when the string contains
    a '%' at all.

    Args:
        uri: An encoded URI (full or partial). If of type str on
            Python 2, UTF-8 is assumed.

    Returns:
        The decoded URI. On Python 2, a `str` input comes back as
        `unicode` only when it contained a '%' (in which case the
        percent-encoded bytes might form multibyte UTF-8 sequences).

    """
    decoded = uri

    # PERF(kgriffs): unquote_plus could translate the plusses too,
    # but when the string holds *only* plusses and no '%', a plain
    # replace saves a lot of time.
    if '+' in decoded:
        decoded = decoded.replace('+', ' ')

    # Nothing is percent-encoded, so there is no need to involve
    # urllib (or any character set decoding) at all.
    if '%' not in decoded:
        return decoded

    decoded = urllib.unquote(decoded)

    # TODO(kgriffs): urllib is broken when it comes to decoding
    # non-ASCII strings on Python 2. Given a str, it does not even
    # try to decode the character set; given a unicode, it simply
    # decodes code points as latin1.
    #
    # PERF(kgriffs): Decoding the unquoted result as UTF-8 adds
    # several microseconds, so it is only done on this path, where
    # there is a chance that multibyte, UTF-8 encoded sequences were
    # percent-encoded. Callers are probably just decoding a query
    # string, and non-ASCII characters are not common in a cloud
    # API's query string.
    if six.PY2 and isinstance(decoded, str):  # pragma nocover
        decoded = decoded.decode('utf-8', 'replace')

    return decoded

View File

@@ -73,19 +73,34 @@ class TestFalconUtils(testtools.TestCase):
self.assertEqual(expected, garbage_out)
def test_percent_escape(self):
def test_uri_encode(self):
# NOTE(review): two method headers appear back to back with no body
# between them, and the assertions below alternate between
# falcon.percent_escape and uri.encode; presumably the diff view is
# showing the old and new versions of one test together — confirm
# against the real test module before relying on this listing.

# This URL is expected to come back from the encoder unchanged.
url = 'http://example.com/v1/fizbit/messages?limit=3&echo=true'
self.assertEqual(falcon.percent_escape(url), url)
self.assertEqual(uri.encode(url), url)
# U+00E7 in the query string is expected to be emitted as %C3%A7.
url2a = u'http://example.com/v1/fizbit/messages?limit=3&e\u00e7ho=true'
url2b = 'http://example.com/v1/fizbit/messages?limit=3&e%C3%A7ho=true'
self.assertEqual(falcon.percent_escape(url2a), url2b)
# A space in the path is expected to be emitted as %20.
url = 'http://example.com/v1/fiz bit/messages'
expected = 'http://example.com/v1/fiz%20bit/messages'
self.assertEqual(uri.encode(url), expected)
def test_decode_value(self):
# NOTE(review): despite the "decode" in the method name, the only call
# made here is uri.encode; presumably this header is an artifact of
# the diff view and these statements belong to the encode test —
# confirm against the real test module.

# U+00E7 in the query string is expected to be emitted as %C3%A7.
url = u'http://example.com/v1/fizbit/messages?limit=3&e\u00e7ho=true'
expected = ('http://example.com/v1/fizbit/messages'
'?limit=3&e%C3%A7ho=true')
self.assertEqual(uri.encode(url), expected)
def test_uri_encode_value(self):
    """Check uri.encode_value against a table of inputs and outputs."""
    # Each pair is (value handed to encode_value, expected result);
    # the pairs are checked in the order listed.
    cases = (
        ('abcd', 'abcd'),
        (u'abcd', u'abcd'),
        (u'ab cd', u'ab%20cd'),
        (u'\u00e7', u'%C3%A7'),
        (u'\u00e7', '%C3%A7'),
        ('ab/cd', 'ab%2Fcd'),
        ('ab+cd=42,9', 'ab%2Bcd%3D42%2C9'),
    )

    for raw, expected in cases:
        self.assertEqual(uri.encode_value(raw), expected)
def test_uri_decode(self):
    """Check uri.decode against a table of inputs and outputs."""
    # Each pair is (value handed to decode, expected result); the
    # pairs are checked in the order listed.
    cases = (
        ('abcd', 'abcd'),
        (u'abcd', u'abcd'),
        (u'ab%20cd', u'ab cd'),
        ('%C3%A7', u'\u00e7'),
        ('ab%2Fcd', 'ab/cd'),
        ('http://example.com?x=ab%2Bcd%3D42%2C9',
         'http://example.com?x=ab+cd=42,9'),
    )

    for encoded, expected in cases:
        self.assertEqual(uri.decode(encoded), expected)