Files
deb-python-falcon/falcon/util/uri.py
Kurt Griffiths da2bc234f8 doc(reference): Standardize docstring syntax
Unify the use of markup in docstrings, particularly relating to the use of
backticks and asterisks. Also clean up any remaining minor inconsistencies or
errors in the docstrings.

Closes #334
2015-02-03 18:45:52 -06:00

366 lines
13 KiB
Python

# Copyright 2013 by Rackspace Hosting, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import six
# NOTE(kgriffs): See also RFC 3986
#
# Characters that are passed through untouched by both encoders defined
# in this module.
_UNRESERVED = (
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ' +
    'abcdefghijklmnopqrstuvwxyz' +
    '0123456789' +
    '-._~'
)

# NOTE(kgriffs): See also RFC 3986
#
# Reserved characters used as delimiters within a URI.
_DELIMITERS = ":/?#[]@!$&'()*+,;="

# Everything that may appear unescaped in a full URI: the unreserved
# characters followed by the delimiters.
_ALL_ALLOWED = ''.join((_UNRESERVED, _DELIMITERS))

# Both upper- and lower-case hex digits, used to build the table that maps
# two-digit escapes back to bytes.
_HEX_DIGITS = '0123456789' 'ABCDEF' 'abcdef'
def _create_char_encoder(allowed_chars):
    """Build a per-byte lookup function used to percent-encode a URI.

    A 256-entry table is generated up front, one entry per possible byte
    value, so that encoding a byte is a single dictionary lookup.

    Args:
        allowed_chars (str): Characters that must be emitted unchanged.

    Returns:
        callable: The bound ``__getitem__`` of the table. Each value is
            either the character itself (when it is in `allowed_chars`)
            or its ``%XX`` escape with upper-case hex digits.
    """

    # NOTE(kgriffs): PY2 returns str from uri.encode, while
    # PY3 returns a byte array. Iterating the former yields one-character
    # strings and the latter yields ints, so the table is keyed to match.
    keyed_by_char = six.PY2

    table = {}
    for ordinal in range(256):
        char = chr(ordinal)

        if char in allowed_chars:
            replacement = char
        else:
            replacement = '%{0:02X}'.format(ordinal)

        table[char if keyed_by_char else ordinal] = replacement

    return table.__getitem__
def _create_str_encoder(is_value):
    """Create a function that percent-encodes a URI or a URI value.

    Args:
        is_value (bool): When true, only the unreserved characters are
            left unescaped; otherwise the reserved delimiters are allowed
            through as well.

    Returns:
        callable: A one-argument function that takes a string and returns
            it with every disallowed character percent-encoded.
    """

    if is_value:
        allowed_chars = _UNRESERVED
    else:
        allowed_chars = _ALL_ALLOWED

    encode_char = _create_char_encoder(allowed_chars)

    def encoder(uri):
        # PERF(kgriffs): Very fast way to check, learned from urllib.quote.
        # Stripping every allowed character leaves something behind only
        # when at least one character needs to be escaped.
        needs_escaping = uri.rstrip(allowed_chars)
        if needs_escaping:
            # Convert to a byte array if it is not one already
            #
            # NOTE(kgriffs): Code coverage disabled since in Py3K the uri
            # is always a text type, so we get a failure for that tox env.
            if isinstance(uri, six.text_type):  # pragma no cover
                uri = uri.encode('utf-8')

            # Run every element through the lookup table and stitch the
            # pieces back together.
            #
            # PERF(kgriffs): map is faster than list comp on py27, but a
            # tiny bit slower on py33. Since we are already much faster
            # than urllib on py33, let's optimize for py27.
            return ''.join(map(encode_char, uri))

        # Nothing to escape; hand back the very same object.
        return uri

    return encoder
# Encoder for whole URIs: reserved delimiters are left intact.
encode = _create_str_encoder(False)
encode.__name__ = 'encode'
encode.__doc__ = """Encodes a full or relative URI according to RFC 3986.

RFC 3986 defines a set of "unreserved" characters as well as a
set of "reserved" characters used as delimiters. This function escapes
all other "disallowed" characters by percent-encoding them.

Note:
    This utility is faster in the average case than the similar
    `quote` function found in ``urllib``. It also strives to be easier
    to use by assuming a sensible default of allowed characters.

Args:
    uri (str): URI or part of a URI to encode. If this is a wide
        string (i.e., ``six.text_type``), it will be encoded to
        a UTF-8 byte array and any multibyte sequences will
        be percent-encoded as-is.

Returns:
    str: An escaped version of `uri`, where all disallowed characters
        have been percent-encoded.

"""

# Encoder for individual values: reserved delimiters are escaped as well.
encode_value = _create_str_encoder(True)
# NOTE: The function name lives in ``__name__``; assigning to a plain
# ``name`` attribute would leave ``__name__`` set to the name of the inner
# closure that _create_str_encoder returns.
encode_value.__name__ = 'encode_value'
encode_value.__doc__ = """Encodes a value string according to RFC 3986.

Disallowed characters are percent-encoded in a way that models
``urllib.parse.quote(safe="~")``. However, the Falcon function is faster
in the average case than the similar `quote` function found in ``urllib``.
It also strives to be easier to use by assuming a sensible default
of allowed characters.

All reserved characters are lumped together into a single set of
"delimiters", and everything in that set is escaped.

Note:
    RFC 3986 defines a set of "unreserved" characters as well as a
    set of "reserved" characters used as delimiters.

Args:
    uri (str): URI fragment to encode. It is assumed not to cross delimiter
        boundaries, and so any reserved URI delimiter characters
        included in it will be escaped. If `uri` is a wide
        string (i.e., ``six.text_type``), it will be encoded to
        a UTF-8 byte array and any multibyte sequences will
        be percent-encoded as-is.

Returns:
    str: An escaped version of `uri`, where all disallowed characters
        have been percent-encoded.

"""
# NOTE(kgriffs): This is actually covered, but not in py33; hence the pragma
if six.PY2:  # pragma: no cover

    # This map construction is based on urllib
    #
    # Maps every two-digit hex string (in any letter case) to a tuple of
    # (decoded one-character str, integer byte value).
    _HEX_TO_BYTE = dict((a + b, (chr(int(a + b, 16)), int(a + b, 16)))
                        for a in _HEX_DIGITS
                        for b in _HEX_DIGITS)

    def decode(encoded_uri):
        """Decodes percent-encoded characters in a URI or query string.

        This function models the behavior of `urllib.parse.unquote_plus`, but
        is faster. It is also more robust, in that it will decode escaped
        UTF-8 multibyte sequences.

        A ``%`` that is not followed by two hex digits is not a valid
        escape; it is copied to the output unchanged instead of raising.

        Args:
            encoded_uri (str): An encoded URI (full or partial).

        Returns:
            str: A decoded URL. Will be of type ``unicode`` on Python 2 IFF the
                URL contained escaped non-ASCII characters, in which case
                UTF-8 is assumed per RFC 3986.

        """

        decoded_uri = encoded_uri

        # PERF(kgriffs): Don't take the time to instantiate a new
        # string unless we have to.
        if '+' in decoded_uri:
            decoded_uri = decoded_uri.replace('+', ' ')

        # Short-circuit if we can
        if '%' not in decoded_uri:
            return decoded_uri

        # Convert to bytes because we are about to replace chars and we
        # don't want Python to mistakenly interpret any high bits.
        if not isinstance(decoded_uri, str):
            # NOTE(kgriffs): Clients should never submit a URI that has
            # unescaped non-ASCII chars in them, but just in case they
            # do, let's encode in a non-lossy format.
            decoded_uri = decoded_uri.encode('utf-8')

        only_ascii = True

        # Every token after the first one was preceded by a '%', so its
        # first two characters should be the hex digits of an escape.
        tokens = decoded_uri.split('%')
        decoded_uri = tokens[0]
        for token in tokens[1:]:
            try:
                char, byte = _HEX_TO_BYTE[token[:2]]
            except KeyError:
                # Malformed escape (e.g. '%zz', '%4' or a trailing '%');
                # put the '%' back and keep the text as-is.
                decoded_uri += '%' + token
                continue

            decoded_uri += char + token[2:]
            only_ascii = only_ascii and (byte <= 127)

        # PERF(kgriffs): Only spend the time to do this if there
        # were non-ascii bytes found in the string.
        if not only_ascii:
            decoded_uri = decoded_uri.decode('utf-8', 'replace')

        return decoded_uri

# NOTE(kgriffs): This is actually covered, but not in py2x; hence the pragma
else:  # pragma: no cover

    # This map construction is based on urllib
    #
    # Maps every two-digit hex sequence (as bytes, in any letter case) to
    # the single byte it represents.
    _HEX_TO_BYTE = dict(((a + b).encode(), bytes([int(a + b, 16)]))
                        for a in _HEX_DIGITS
                        for b in _HEX_DIGITS)

    def _unescape(matchobj):
        """Return the byte for a matched ``%XX`` escape.

        NOTE(review): decode() below does not call this helper; it looks
        like a leftover from a regex-based implementation - confirm before
        relying on it.
        """

        # NOTE(kgriffs): Strip '%' and convert the hex number
        return _HEX_TO_BYTE[matchobj.group(0)[1:]]

    def decode(encoded_uri):
        """Decodes percent-encoded characters in a URI or query string.

        This function models the behavior of `urllib.parse.unquote_plus`,
        albeit in a faster, more straightforward manner.

        A ``%`` that is not followed by two hex digits is not a valid
        escape; it is copied to the output unchanged instead of raising.

        Args:
            encoded_uri (str): An encoded URI (full or partial).

        Returns:
            str: A decoded URL. If the URL contains escaped non-ASCII
                characters, UTF-8 is assumed per RFC 3986.

        """

        decoded_uri = encoded_uri

        # PERF(kgriffs): Don't take the time to instantiate a new
        # string unless we have to.
        if '+' in decoded_uri:
            decoded_uri = decoded_uri.replace('+', ' ')

        # Short-circuit if we can
        if '%' not in decoded_uri:
            return decoded_uri

        # NOTE(kgriffs): Clients should never submit a URI that has
        # unescaped non-ASCII chars in them, but just in case they
        # do, let's encode into a non-lossy format.
        decoded_uri = decoded_uri.encode('utf-8')

        # PERF(kgriffs): This was found to be faster than using
        # a regex sub call or list comprehension with a join.
        tokens = decoded_uri.split(b'%')
        decoded_uri = tokens[0]
        for token in tokens[1:]:
            try:
                unescaped = _HEX_TO_BYTE[token[:2]]
            except KeyError:
                # Malformed escape (e.g. '%zz', '%4' or a trailing '%');
                # put the '%' back and keep the text as-is.
                decoded_uri += b'%' + token
                continue

            decoded_uri += unescaped + token[2:]

        # Convert back to str
        return decoded_uri.decode('utf-8', 'replace')
def parse_query_string(query_string, keep_blank_qs_values=False):
    """Parse a query string into a dict.

    Query string parameters are assumed to use standard form-encoding. Only
    parameters with values are parsed. For example, given 'foo=bar&flag',
    this function would ignore 'flag' unless the `keep_blank_qs_values` option
    is set.

    Fields that have neither a name nor a value (an empty query string,
    or the gaps in 'a=1&&b=2' and 'a=1&') are always ignored, regardless
    of `keep_blank_qs_values`.

    Note:
        In addition to the standard HTML form-based method for specifying
        lists by repeating a given param multiple times, Falcon supports
        a more compact form in which the param may be given a single time
        but set to a ``list`` of comma-separated elements (e.g., 'foo=a,b,c').

        The two different ways of specifying lists may not be mixed in
        a single query string for the same parameter.

    Args:
        query_string (str): The query string to parse. No decoding of
            names or values is performed here.
        keep_blank_qs_values (bool): If set to ``True``, preserves boolean
            fields and fields with no content as blank strings.

    Returns:
        dict: A dictionary of (*name*, *value*) pairs, one per query
            parameter. Note that *value* may be a single ``str``, or a
            ``list`` of ``str``.

    Raises:
        AttributeError or TypeError: `query_string` was not a string. No
            explicit type check is made; the error comes from the failed
            ``split`` call.

    """

    params = {}

    # PERF(kgriffs): This was found to be faster than using a regex, for
    # both short and long query strings. Tested on both CPython 2.7 and 3.4,
    # and on PyPy 2.3.
    for field in query_string.split('&'):
        k, _, v = field.partition('=')

        # Skip valueless fields unless blanks were requested, and always
        # skip fields that are completely empty so that '' or a stray '&'
        # never produces a bogus '' -> '' entry.
        if not v and (not keep_blank_qs_values or not k):
            continue

        if k in params:
            # The key was present more than once in the POST data. Convert to
            # a list, or append the next value to the list.
            old_value = params[k]
            if isinstance(old_value, list):
                old_value.append(v)
            else:
                params[k] = [old_value, v]

            continue

        if ',' in v:
            # NOTE(kgriffs): Falcon supports a more compact form of
            # lists, in which the elements are comma-separated and
            # assigned to a single param instance. If it turns out that
            # very few people use this, it can be deprecated at some
            # point.
            v = v.split(',')

            if not keep_blank_qs_values:
                # NOTE(kgriffs): Normalize the result in the case that
                # some elements are empty strings, such that the result
                # will be the same for 'foo=1,,3' as 'foo=1&foo=&foo=3'.
                v = [element for element in v if element]

        params[k] = v

    return params
def parse_host(host, default_port=None):
    """Parse a canonical 'host:port' string into parts.

    Parse a host string (which may or may not contain a port) into
    parts, taking into account that the string may contain
    either a domain name or an IP address. In the latter case,
    both IPv4 and IPv6 addresses are supported.

    Args:
        host (str): Host string to parse, optionally containing a
            port number.
        default_port (int, optional): Port number to return when
            the host string does not contain one (default ``None``).

    Returns:
        tuple: A parsed (*host*, *port*) tuple from the given
            host string, with the port converted to an ``int``.
            If the host string does not specify a port, or the port
            after the colon is empty (e.g., 'example.com:'),
            `default_port` is used instead.

    Raises:
        ValueError: The port portion was present but could not be
            converted to an ``int``.

    """

    # NOTE(kgriff): The value from the Host header may
    # contain a port, so check that and strip it if
    # necessary. This is complicated by the fact that
    # a hostname may be specified either as an IP address
    # or as a domain name, and in the case of IPv6 there
    # may be multiple colons in the string.
    if host.startswith('['):
        # Bracketed IPv6 address, possibly followed by a port
        pos = host.rfind(']:')
        if pos == -1:
            # No port; just drop the surrounding brackets
            return (host[1:-1], default_port)

        port = host[pos + 2:]
        return (host[1:pos], int(port) if port else default_port)

    pos = host.rfind(':')
    if (pos == -1) or (pos != host.find(':')):
        # Bare domain name or IP address. More than one colon without
        # brackets is taken to be an IPv6 address with no port.
        return (host, default_port)

    # NOTE(kgriffs): At this point we know that there was
    # only a single colon, so we should have an IPv4 address
    # or a domain name plus a port
    name, _, port = host.partition(':')

    # An empty port is permitted by RFC 3986 and means "use the default",
    # so don't feed an empty string to int().
    return (name, int(port) if port else default_port)