diff --git a/rfc3986/__init__.py b/rfc3986/__init__.py
index 14d0ea0..72652cf 100644
--- a/rfc3986/__init__.py
+++ b/rfc3986/__init__.py
@@ -29,8 +29,17 @@ __author__ = 'Ian Cordasco'
 __author_email__ = 'ian.cordasco@rackspace.com'
 __license__ = 'Apache v2.0'
 __copyright__ = 'Copyright 2014 Rackspace'
-__version__ = '0.2.2'
+__version__ = '0.3.0.dev1'
 
-from .api import (URIReference, uri_reference, is_valid_uri, normalize_uri)
+from .api import (URIReference, uri_reference, is_valid_uri, normalize_uri,
+                  urlparse)
+from .parseresult import ParseResult
 
-__all__ = ['URIReference', 'uri_reference', 'is_valid_uri', 'normalize_uri']
+__all__ = (
+    'ParseResult',
+    'URIReference',
+    'is_valid_uri',
+    'normalize_uri',
+    'uri_reference',
+    'urlparse',
+)
diff --git a/rfc3986/api.py b/rfc3986/api.py
index c993bb8..3e9e401 100644
--- a/rfc3986/api.py
+++ b/rfc3986/api.py
@@ -21,6 +21,7 @@ provides access to the class ``URIReference``.
 """
 
 from .uri import URIReference
+from .parseresult import ParseResult
 
 
 def uri_reference(uri, encoding='utf-8'):
@@ -76,3 +77,16 @@ def normalize_uri(uri, encoding='utf-8'):
     """
     normalized_reference = URIReference.from_string(uri, encoding).normalize()
     return normalized_reference.unsplit()
+
+
+def urlparse(uri, encoding='utf-8'):
+    """Parse a given URI and return a ParseResult.
+
+    This is a partial replacement of the standard library's urlparse function.
+
+    :param str uri: The URI to be parsed.
+    :param str encoding: The encoding of the string provided.
+    :returns: A parsed URI
+    :rtype: :class:`~rfc3986.parseresult.ParseResult`
+    """
+    return ParseResult.from_string(uri, encoding, strict=False)
diff --git a/rfc3986/exceptions.py b/rfc3986/exceptions.py
index fe072e1..f9adbde 100644
--- a/rfc3986/exceptions.py
+++ b/rfc3986/exceptions.py
@@ -9,6 +9,12 @@ class InvalidAuthority(RFC3986Exception):
             "The authority ({0}) is not valid.".format(authority))
 
 
+class InvalidPort(RFC3986Exception):
+    def __init__(self, port):
+        super(InvalidPort, self).__init__(
+            'The port ("{0}") is not valid.'.format(port))
+
+
 class ResolutionError(RFC3986Exception):
     def __init__(self, uri):
         super(ResolutionError, self).__init__(
diff --git a/rfc3986/urlparse.py b/rfc3986/parseresult.py
similarity index 66%
rename from rfc3986/urlparse.py
rename to rfc3986/parseresult.py
index d87d10e..5bb888c 100644
--- a/rfc3986/urlparse.py
+++ b/rfc3986/parseresult.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 from collections import namedtuple
 
+from . import exceptions
 from . import normalizers
 from . import uri
 
@@ -40,19 +41,35 @@ class ParseResult(namedtuple('ParseResult', PARSED_COMPONENTS)):
         return parse_result
 
     @classmethod
-    def from_string(cls, uri_string, encoding='utf-8'):
+    def from_string(cls, uri_string, encoding='utf-8', strict=True):
         """Parse a URI from the given unicode URI string.
 
         :param str uri_string: Unicode URI to be parsed into a reference.
         :param str encoding: The encoding of the string provided
+        :param bool strict: Parse strictly according to :rfc:`3986` if True.
+            If False, parse similarly to the standard library's urlparse
+            function.
         :returns: :class:`ParseResult` or subclass thereof
         """
         reference = uri.URIReference.from_string(uri_string, encoding)
-        subauthority = reference.authority_info()
-        # Thanks to Richard Barrell for this idea:
-        # https://twitter.com/0x2ba22e11/status/617338811975139328
-        userinfo, host, port = (subauthority.get(p)
-                                for p in ('userinfo', 'host', 'port'))
+        try:
+            subauthority = reference.authority_info()
+        except exceptions.InvalidAuthority:
+            if strict:
+                raise
+            userinfo, host, port = split_authority(reference.authority)
+        else:
+            # Thanks to Richard Barrell for this idea:
+            # https://twitter.com/0x2ba22e11/status/617338811975139328
+            userinfo, host, port = (subauthority.get(p)
+                                    for p in ('userinfo', 'host', 'port'))
+
+        if port:
+            try:
+                port = int(port)
+            except ValueError:
+                raise exceptions.InvalidPort(port)
+
         return cls(scheme=reference.scheme,
                    userinfo=userinfo,
                    host=host,
@@ -101,6 +118,25 @@ class ParseResult(namedtuple('ParseResult', PARSED_COMPONENTS)):
                                        fragment=attrs_dict.get('fragment'))
         return ParseResult(uri_ref=ref, **attrs_dict)
 
+    def geturl(self):
+        """Standard library shim to the unsplit method."""
+        return self.unsplit()
+
+    @property
+    def hostname(self):
+        """Standard library shim for the host portion of the URI."""
+        return self.host
+
+    @property
+    def netloc(self):
+        """Standard library shim for the authority portion of the URI."""
+        return self.authority
+
+    @property
+    def params(self):
+        """Standard library shim for the query portion of the URI."""
+        return self.query
+
     def unsplit(self):
         """Create a URI string from the components.
 
@@ -108,3 +144,41 @@ class ParseResult(namedtuple('ParseResult', PARSED_COMPONENTS)):
         :rtype: str
         """
         return self.reference.unsplit()
+
+
+def split_authority(authority):
+    """Leniently split an authority into ``(userinfo, host, port)``.
+
+    Fallback used by ``ParseResult.from_string`` when ``strict`` is False
+    and the authority does not validate. Components that are not found are
+    left as ``None``; the port, when found, is returned as a string.
+
+    :param str authority: The authority portion of a URI.
+    :returns: A ``(userinfo, host, port)`` tuple.
+    """
+    # Initialize our expected return values
+    userinfo = host = port = None
+    # Initialize an extra var we may need to use
+    extra_host = None
+    # Set-up rest in case there is no userinfo portion
+    rest = authority
+
+    if u'@' in authority:
+        userinfo, rest = authority.rsplit(u'@', 1)
+
+    # Handle IPv6 host addresses
+    if rest.startswith(u'['):
+        host, rest = rest.split(u']', 1)
+        # split() consumed the closing bracket; put it back on the host
+        host += u']'
+
+    if u':' in rest:
+        extra_host, port = rest.split(u':', 1)
+    elif not host and rest:
+        # No port delimiter, so whatever remains is the host itself
+        host = rest
+
+    if extra_host and not host:
+        host = extra_host
+
+    return userinfo, host, port
diff --git a/tests/test_unicode_support.py b/tests/test_unicode_support.py
index 0d6ea86..798d043 100644
--- a/tests/test_unicode_support.py
+++ b/tests/test_unicode_support.py
@@ -1,10 +1,14 @@
 # -*- coding: utf-8 -*-
 
 from rfc3986 import uri_reference
+from rfc3986 import urlparse
+
+
+SNOWMAN = b'\xe2\x98\x83'
 
 
 def test_unicode_uri():
-    url_bytestring = b'http://example.com?utf8=\xe2\x98\x83'
+    url_bytestring = b'http://example.com?utf8=' + SNOWMAN
     unicode_url = url_bytestring.decode('utf-8')
     uri = uri_reference(unicode_url)
     assert uri.is_valid() is True
@@ -12,15 +16,21 @@ def test_unicode_uri():
 
 
 def test_unicode_uri_passed_as_bytes():
-    url_bytestring = b'http://example.com?utf8=\xe2\x98\x83'
+    url_bytestring = b'http://example.com?utf8=' + SNOWMAN
     uri = uri_reference(url_bytestring)
     assert uri.is_valid() is True
     assert uri == 'http://example.com?utf8=%E2%98%83'
 
 
 def test_unicode_authority():
-    url_bytestring = b'http://\xe2\x98\x83.com'
+    url_bytestring = b'http://' + SNOWMAN + b'.com'
     unicode_url = url_bytestring.decode('utf-8')
     uri = uri_reference(unicode_url)
     assert uri.is_valid() is False
     assert uri == unicode_url
+
+
+def test_unicode_hostname():
+    url_bytestring = b'http://' + SNOWMAN + b'.com'
+    parsed = urlparse(url_bytestring)
+    assert parsed