Add urlparse function, rename urlparse module

- Start testing urlparse by checking that it does not raise an error on the
  unicode snowman domain

- Add strict keyword argument to ParseResult.from_string to turn off strict
  authority validation.

- Add InvalidPort exception so that a failure to convert a port string to an
  int is reported with a specific error.

- Add stdlib compatibility shims
This commit is contained in:
Ian Cordasco
2015-07-04 15:37:18 -05:00
parent 4dcde4c99b
commit 6d7d593272
5 changed files with 111 additions and 12 deletions

View File

@@ -29,8 +29,17 @@ __author__ = 'Ian Cordasco'
__author_email__ = 'ian.cordasco@rackspace.com'
__license__ = 'Apache v2.0'
__copyright__ = 'Copyright 2014 Rackspace'
__version__ = '0.2.2'
__version__ = '0.3.0.dev1'
from .api import (URIReference, uri_reference, is_valid_uri, normalize_uri)
from .api import (URIReference, uri_reference, is_valid_uri, normalize_uri,
urlparse)
from .parseresult import ParseResult
__all__ = ['URIReference', 'uri_reference', 'is_valid_uri', 'normalize_uri']
__all__ = (
'ParseResult',
'URIReference',
'is_valid_uri',
'normalize_uri',
'uri_reference',
'urlparse',
)

View File

@@ -21,6 +21,7 @@ provides access to the class ``URIReference``.
"""
from .uri import URIReference
from .parseresult import ParseResult
def uri_reference(uri, encoding='utf-8'):
@@ -76,3 +77,16 @@ def normalize_uri(uri, encoding='utf-8'):
"""
normalized_reference = URIReference.from_string(uri, encoding).normalize()
return normalized_reference.unsplit()
def urlparse(uri, encoding='utf-8'):
    """Parse ``uri`` leniently and hand back its components.

    Meant as a partial stand-in for the standard library's ``urlparse``
    function. The parsing itself is delegated to
    :meth:`ParseResult.from_string` with ``strict=False``.

    :param str uri: The URI to be parsed.
    :param str encoding: The encoding of the string provided.
    :returns: A parsed URI
    :rtype: :class:`~rfc3986.parseresult.ParseResult`
    """
    parsed = ParseResult.from_string(uri, encoding, strict=False)
    return parsed

View File

@@ -9,6 +9,12 @@ class InvalidAuthority(RFC3986Exception):
"The authority ({0}) is not valid.".format(authority))
class InvalidPort(RFC3986Exception):
    """Error whose message names the port value that was rejected."""

    def __init__(self, port):
        # Build the message first so the super() call stays short.
        message = 'The port ("{0}") is not valid.'.format(port)
        super(InvalidPort, self).__init__(message)
class ResolutionError(RFC3986Exception):
def __init__(self, uri):
super(ResolutionError, self).__init__(

View File

@@ -14,6 +14,7 @@
# limitations under the License.
from collections import namedtuple
from . import exceptions
from . import normalizers
from . import uri
@@ -40,19 +41,35 @@ class ParseResult(namedtuple('ParseResult', PARSED_COMPONENTS)):
return parse_result
@classmethod
def from_string(cls, uri_string, encoding='utf-8'):
def from_string(cls, uri_string, encoding='utf-8', strict=True):
"""Parse a URI from the given unicode URI string.
:param str uri_string: Unicode URI to be parsed into a reference.
:param str encoding: The encoding of the string provided
:param bool strict: Parse strictly according to :rfc:`3986` if True.
If False, parse similarly to the standard library's urlparse
function.
:returns: :class:`ParseResult` or subclass thereof
"""
reference = uri.URIReference.from_string(uri_string, encoding)
subauthority = reference.authority_info()
# Thanks to Richard Barrell for this idea:
# https://twitter.com/0x2ba22e11/status/617338811975139328
userinfo, host, port = (subauthority.get(p)
for p in ('userinfo', 'host', 'port'))
try:
subauthority = reference.authority_info()
except exceptions.InvalidAuthority:
if strict:
raise
userinfo, host, port = split_authority(reference.authority)
else:
# Thanks to Richard Barrell for this idea:
# https://twitter.com/0x2ba22e11/status/617338811975139328
userinfo, host, port = (subauthority.get(p)
for p in ('userinfo', 'host', 'port'))
if port:
try:
port = int(port)
except ValueError:
raise exceptions.InvalidPort(port)
return cls(scheme=reference.scheme,
userinfo=userinfo,
host=host,
@@ -101,6 +118,25 @@ class ParseResult(namedtuple('ParseResult', PARSED_COMPONENTS)):
fragment=attrs_dict.get('fragment'))
return ParseResult(uri_ref=ref, **attrs_dict)
def geturl(self):
    """Standard-library-compatible alias for :meth:`unsplit`."""
    url = self.unsplit()
    return url
@property
def hostname(self):
    """Expose ``host`` under the standard library's ``hostname`` name."""
    host = self.host
    return host
@property
def netloc(self):
    """Expose ``authority`` under the standard library's ``netloc`` name."""
    authority = self.authority
    return authority
@property
def params(self):
    """Expose ``query`` under the standard library's ``params`` name.

    NOTE(review): in the standard library's parse result, ``params`` holds
    the parameters of the last path element rather than the query string.
    Confirm that mapping it to ``query`` is intended.
    """
    query = self.query
    return query
def unsplit(self):
"""Create a URI string from the components.
@@ -108,3 +144,27 @@ class ParseResult(namedtuple('ParseResult', PARSED_COMPONENTS)):
:rtype: str
"""
return self.reference.unsplit()
def split_authority(authority):
    """Leniently split an authority string into its three parts.

    No validation is performed; the string is only cut at its delimiters.

    :param authority: The authority text (``[userinfo@]host[:port]``), or
        ``None``/empty when the URI has no authority.
    :returns: A ``(userinfo, host, port)`` tuple. Parts that are absent are
        ``None``. ``port`` is returned as text, not as an ``int``; it is the
        empty string when the authority ends in a bare ``:``.
    """
    # Nothing to split for a missing or empty authority.
    if not authority:
        return None, None, None

    userinfo = host = port = None
    # Covers the case where there is no userinfo portion.
    rest = authority

    # Split on the last '@' so any earlier '@' stays within the userinfo.
    if u'@' in authority:
        userinfo, rest = authority.rsplit(u'@', 1)

    # Handle IPv6 (bracketed) host addresses. ``partition`` never raises, so
    # an unterminated '[' leaves the whole remainder as the host instead of
    # failing with an unpacking ValueError.
    if rest.startswith(u'['):
        host, closing, rest = rest.partition(u']')
        # Keep the closing bracket so the literal stays intact ("[::1]").
        host += closing

    if u':' in rest:
        extra_host, port = rest.split(u':', 1)
        # For a bracketed host, the text before ':' is empty and is ignored.
        if extra_host and not host:
            host = extra_host
    elif rest and not host:
        # No port delimiter at all: everything left over is the host.
        host = rest

    return userinfo, host, port

View File

@@ -1,10 +1,14 @@
# -*- coding: utf-8 -*-
from rfc3986 import uri_reference
from rfc3986 import urlparse
SNOWMAN = b'\xe2\x98\x83'
def test_unicode_uri():
url_bytestring = b'http://example.com?utf8=\xe2\x98\x83'
url_bytestring = b'http://example.com?utf8=' + SNOWMAN
unicode_url = url_bytestring.decode('utf-8')
uri = uri_reference(unicode_url)
assert uri.is_valid() is True
@@ -12,15 +16,21 @@ def test_unicode_uri():
def test_unicode_uri_passed_as_bytes():
url_bytestring = b'http://example.com?utf8=\xe2\x98\x83'
url_bytestring = b'http://example.com?utf8=' + SNOWMAN
uri = uri_reference(url_bytestring)
assert uri.is_valid() is True
assert uri == 'http://example.com?utf8=%E2%98%83'
def test_unicode_authority():
url_bytestring = b'http://\xe2\x98\x83.com'
url_bytestring = b'http://' + SNOWMAN + b'.com'
unicode_url = url_bytestring.decode('utf-8')
uri = uri_reference(unicode_url)
assert uri.is_valid() is False
assert uri == unicode_url
def test_unicode_hostname():
    """urlparse should accept a byte-string URL whose host is the snowman."""
    snowman_url = b''.join((b'http://', SNOWMAN, b'.com'))
    result = urlparse(snowman_url)
    # NOTE(review): this only checks that parsing returned a truthy object;
    # consider asserting on individual components once their values are
    # settled.
    assert result