Add urlparse function, rename urlparse module

- Start testing by checking that the urlparse function doesn't error on the
  Unicode snowman domain
- Add strict keyword argument to ParseResult.from_string to turn off strict
  authority validation
- Add InvalidPort exception to be very specific when converting a port
  string to an int fails
- Add stdlib compatibility shims
2015-07-04 15:37:18 -05:00
parent 4dcde4c99b
commit 6d7d593272
5 changed files with 111 additions and 12 deletions
--- a/rfc3986/init.py
+++ b/rfc3986/init.py
@@ -29,8 +29,17 @@ __author__ = 'Ian Cordasco'
 __author_email__ = 'ian.cordasco@rackspace.com'
 __license__ = 'Apache v2.0'
 __copyright__ = 'Copyright 2014 Rackspace'
-__version__ = '0.2.2'
+__version__ = '0.3.0.dev1'

-from .api import (URIReference, uri_reference, is_valid_uri, normalize_uri)
+from .api import (URIReference, uri_reference, is_valid_uri, normalize_uri,
+                  urlparse)
+from .parseresult import ParseResult

-__all__ = ['URIReference', 'uri_reference', 'is_valid_uri', 'normalize_uri']
+__all__ = (
+    'ParseResult',
+    'URIReference',
+    'is_valid_uri',
+    'normalize_uri',
+    'uri_reference',
+    'urlparse',
+)
--- a/rfc3986/api.py
+++ b/rfc3986/api.py
@@ -21,6 +21,7 @@ provides access to the class ``URIReference``.
 """

 from .uri import URIReference
+from .parseresult import ParseResult


 def uri_reference(uri, encoding='utf-8'):
@@ -76,3 +77,16 @@ def normalize_uri(uri, encoding='utf-8'):
    """
    normalized_reference = URIReference.from_string(uri, encoding).normalize()
    return normalized_reference.unsplit()
+
+
+def urlparse(uri, encoding='utf-8'):
+    """Leniently parse a URI into a ParseResult.
+
+    Partial replacement for the stdlib urlparse: parses with strict=False.
+
+    :param str uri: The URI to be parsed.
+    :param str encoding: The encoding of the string provided.
+    :returns: The parsed components of ``uri``
+    :rtype: :class:`~rfc3986.parseresult.ParseResult`
+    """
+    return ParseResult.from_string(uri, encoding=encoding, strict=False)
--- a/rfc3986/exceptions.py
+++ b/rfc3986/exceptions.py
@@ -9,6 +9,12 @@ class InvalidAuthority(RFC3986Exception):
            "The authority ({0}) is not valid.".format(authority))


+class InvalidPort(RFC3986Exception):  # raised when int(port) fails
+    def __init__(self, port):
+        message = 'The port ("{0}") is not valid.'.format(port)
+        super(InvalidPort, self).__init__(message)
+
+
 class ResolutionError(RFC3986Exception):
    def __init__(self, uri):
        super(ResolutionError, self).__init__(
--- a/rfc3986/parseresult.py
+++ b/rfc3986/parseresult.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 from collections import namedtuple

+from . import exceptions
 from . import normalizers
 from . import uri

@@ -40,19 +41,35 @@ class ParseResult(namedtuple('ParseResult', PARSED_COMPONENTS)):
        return parse_result

    @classmethod
-    def from_string(cls, uri_string, encoding='utf-8'):
+    def from_string(cls, uri_string, encoding='utf-8', strict=True):
        """Parse a URI from the given unicode URI string.

        :param str uri_string: Unicode URI to be parsed into a reference.
        :param str encoding: The encoding of the string provided
+        :param bool strict: Parse strictly according to :rfc:`3986` if True.
+            If False, parse similarly to the standard library's urlparse
+            function.
        :returns: :class:`ParseResult` or subclass thereof
        """
        reference = uri.URIReference.from_string(uri_string, encoding)
-        subauthority = reference.authority_info()
-        # Thanks to Richard Barrell for this idea:
-        # https://twitter.com/0x2ba22e11/status/617338811975139328
-        userinfo, host, port = (subauthority.get(p)
-                                for p in ('userinfo', 'host', 'port'))
+        try:
+            subauthority = reference.authority_info()
+        except exceptions.InvalidAuthority:
+            if strict:
+                raise
+            userinfo, host, port = split_authority(reference.authority)
+        else:
+            # Thanks to Richard Barrell for this idea:
+            # https://twitter.com/0x2ba22e11/status/617338811975139328
+            userinfo, host, port = (subauthority.get(p)
+                                    for p in ('userinfo', 'host', 'port'))
+
+        if port:
+            try:
+                port = int(port)
+            except ValueError:
+                raise exceptions.InvalidPort(port)
+
        return cls(scheme=reference.scheme,
                   userinfo=userinfo,
                   host=host,
@@ -101,6 +118,25 @@ class ParseResult(namedtuple('ParseResult', PARSED_COMPONENTS)):
                                       fragment=attrs_dict.get('fragment'))
        return ParseResult(uri_ref=ref, **attrs_dict)

+    def geturl(self):
+        """Return the URI string; stdlib-named alias of ``unsplit``."""
+        return self.unsplit()
+
+    @property
+    def hostname(self):
+        """Return ``self.host`` under the standard library's name."""
+        return self.host
+
+    @property
+    def netloc(self):
+        """Return ``self.authority`` under the standard library's name."""
+        return self.authority
+
+    @property
+    def params(self):
+        """Return ``self.query``; NOTE(review): stdlib ``params`` differs."""
+        return self.query
+
    def unsplit(self):
        """Create a URI string from the components.

@@ -108,3 +144,27 @@ class ParseResult(namedtuple('ParseResult', PARSED_COMPONENTS)):
        :rtype: str
        """
        return self.reference.unsplit()
+
+
+def split_authority(authority):
+    """Leniently split ``authority`` into ``(userinfo, host, port)``.
+
+    Absent parts are ``None``; ``port`` is the raw text after the colon.
+    """
+    userinfo = host = port = None
+    # With no ``@`` the whole authority is the host[:port] section.
+    rest = authority
+    if u'@' in authority:
+        userinfo, rest = authority.rsplit(u'@', 1)
+
+    # Bracketed IPv6 literal: the host keeps its closing bracket.
+    if rest.startswith(u'[') and u']' in rest:
+        host, rest = rest.split(u']', 1)
+        host += u']'
+
+    if u':' in rest:
+        extra_host, port = rest.split(u':', 1)
+        host = host or extra_host or None
+    elif not host and rest:
+        host = rest  # bare host such as ``example.com``: no colon at all
+    return userinfo, host, port
--- a/tests/test_unicode_support.py
+++ b/tests/test_unicode_support.py
@@ -1,10 +1,14 @@
 # -*- coding: utf-8 -*-

 from rfc3986 import uri_reference
+from rfc3986 import urlparse
+
+
+SNOWMAN = b'\xe2\x98\x83'


 def test_unicode_uri():
-    url_bytestring = b'http://example.com?utf8=\xe2\x98\x83'
+    url_bytestring = b'http://example.com?utf8=' + SNOWMAN
    unicode_url = url_bytestring.decode('utf-8')
    uri = uri_reference(unicode_url)
    assert uri.is_valid() is True
@@ -12,15 +16,21 @@ def test_unicode_uri():


 def test_unicode_uri_passed_as_bytes():
-    url_bytestring = b'http://example.com?utf8=\xe2\x98\x83'
+    url_bytestring = b'http://example.com?utf8=' + SNOWMAN
    uri = uri_reference(url_bytestring)
    assert uri.is_valid() is True
    assert uri == 'http://example.com?utf8=%E2%98%83'


 def test_unicode_authority():
-    url_bytestring = b'http://\xe2\x98\x83.com'
+    url_bytestring = b'http://' + SNOWMAN + b'.com'
    unicode_url = url_bytestring.decode('utf-8')
    uri = uri_reference(unicode_url)
    assert uri.is_valid() is False
    assert uri == unicode_url
+
+
+def test_unicode_hostname():
+    """urlparse must not raise on a non-ASCII (snowman) host."""
+    snowman_url = b''.join((b'http://', SNOWMAN, b'.com'))
+    assert urlparse(snowman_url)