diff --git a/falcon/request.py b/falcon/request.py index 1e22863..7841999 100644 --- a/falcon/request.py +++ b/falcon/request.py @@ -35,8 +35,10 @@ import six from falcon.exceptions import HTTPBadRequest from falcon import util +from falcon.util import uri from falcon import request_helpers as helpers + DEFAULT_ERROR_LOG_FORMAT = (u'{0:%Y-%m-%d %H:%M:%S} [FALCON] [ERROR]' u' {1} {2}{3} => ') @@ -108,7 +110,18 @@ class Request(object): # QUERY_STRING isn't required to be in env, so let's check # PERF: if...in is faster than using env.get(...) if 'QUERY_STRING' in env and env['QUERY_STRING']: - self.query_string = util.percent_unescape(env['QUERY_STRING']) + + # TODO(kgriffs): Should this escape individual values instead + # of the entire string? The way it is now, this: + # + # x=ab%2Bcd%3D42%2C9 + # + # becomes this: + # + # x=ab+cd=42,9 + # + self.query_string = uri.decode(env['QUERY_STRING']) + else: self.query_string = six.text_type() diff --git a/falcon/util/misc.py b/falcon/util/misc.py index 6185c9a..85d0879 100644 --- a/falcon/util/misc.py +++ b/falcon/util/misc.py @@ -18,15 +18,9 @@ limitations under the License. import datetime import functools -import six import inspect import warnings -if six.PY3: # pragma nocover - import urllib.parse as urllib # pylint: disable=E0611 -else: # pragma nocover - import urllib - from falcon.util import uri @@ -149,18 +143,5 @@ def to_query_str(params): # TODO(kgriffs): Remove this alias in Falcon v0.2.0 percent_escape = uri.encode - -# TODO(kgriffs): Move this to falcon.uri.decode in Falcon v0.2.0 -def percent_unescape(nstr): - """Percent-unescape an input native string into a url. - - Args: - nstr: A URL in native string (\u0000 - \u00FF). - - Returns: - A URL as a python string, decoded as UTF-8. 
def decode(uri):
    """Decode any percent-encoded characters in a URI or query string.

    Plus signs are translated to spaces, and percent-encoded octets are
    interpreted as UTF-8. Octet sequences that are not valid UTF-8 are
    replaced with U+FFFD instead of raising.

    Args:
        uri: An encoded URI (full or partial). If of type str on Python 2,
            UTF-8 is assumed.

    Returns:
        A decoded URL. Will be of type `unicode` on Python 2 IFF `uri`
        contains percent-encoded chars (in which case there is a chance
        they might contain multibyte Unicode sequences).

    """

    decoded_uri = uri

    # PERF(kgriffs): unquote_plus can do this, but if there are
    # *only* plusses in the string, no '%', we can save a lot of
    # time by never calling into urllib at all.
    if '+' in decoded_uri:
        decoded_uri = decoded_uri.replace('+', ' ')

    if '%' not in decoded_uri:
        return decoded_uri

    # NOTE: On Python 2, urllib.unquote maps the escaped octets of a
    # unicode string straight to latin-1 code points, which mangles
    # percent-encoded multibyte UTF-8 sequences. Hand it UTF-8 bytes
    # instead so the octets can be decoded properly below. On Python 3
    # the input is already str here, so this branch is never taken.
    if not isinstance(decoded_uri, str):  # pragma nocover
        decoded_uri = decoded_uri.encode('utf-8')

    decoded_uri = urllib.unquote(decoded_uri)

    # PERF(kgriffs): Only spend the time to do this if there
    # is a chance there were multibyte, UTF-8 encoded
    # sequences that were percent-encoded. Only Python 2 hands
    # back a byte string here (bytes is str there); Python 3's
    # unquote has already decoded the octets as UTF-8.
    if isinstance(decoded_uri, bytes):  # pragma nocover
        decoded_uri = decoded_uri.decode('utf-8', 'replace')

    return decoded_uri
def test_uri_decode(self):
    """Check uri.decode on plain, plus-only, and percent-encoded input."""

    # Nothing to decode: the input comes back unchanged.
    self.assertEqual(uri.decode(''), '')
    self.assertEqual(uri.decode('abcd'), 'abcd')
    self.assertEqual(uri.decode(u'abcd'), u'abcd')

    # Plus signs become spaces, with or without '%' in the string.
    self.assertEqual(uri.decode('ab+cd'), 'ab cd')
    self.assertEqual(uri.decode('a+b%20c'), 'a b c')

    # Percent-encoded octets, including a multibyte UTF-8 sequence.
    self.assertEqual(uri.decode(u'ab%20cd'), u'ab cd')
    self.assertEqual(uri.decode('%C3%A7'), u'\u00e7')
    self.assertEqual(uri.decode('ab%2Fcd'), 'ab/cd')

    # An escaped plus must survive as a literal '+', not a space.
    self.assertEqual(uri.decode('http://example.com?x=ab%2Bcd%3D42%2C9'),
                     'http://example.com?x=ab+cd=42,9')