better integration of urllib: only redefine classes that use i/o
This commit is contained in:
@@ -1,71 +1,19 @@
|
|||||||
"""Open an arbitrary URL.
|
from __future__ import absolute_import
|
||||||
|
import urllib
|
||||||
|
from urllib import *
|
||||||
|
|
||||||
See the following document for more info on URLs:
|
# import the following to be a better drop-in replacement
|
||||||
"Names and Addresses, URIs, URLs, URNs, URCs", at
|
from urllib import (__all__, __version__, MAXFTPCACHE, ContentTooShortError,
|
||||||
http://www.w3.org/pub/WWW/Addressing/Overview.html
|
ftpcache, _noheaders, noheaders, addbase, addclosehook,
|
||||||
|
addinfo, addinfourl, _is_unicode, toBytes, _hextochr,
|
||||||
|
always_safe, getproxies_environment, proxy_bypass)
|
||||||
|
|
||||||
See also the HTTP spec (from which the error codes are derived):
|
|
||||||
"HTTP - Hypertext Transfer Protocol", at
|
|
||||||
http://www.w3.org/pub/WWW/Protocols/
|
|
||||||
|
|
||||||
Related standards and specs:
|
|
||||||
- RFC1808: the "relative URL" spec. (authoritative status)
|
|
||||||
- RFC1738 - the "URL standard". (authoritative status)
|
|
||||||
- RFC1630 - the "URI spec". (informational status)
|
|
||||||
|
|
||||||
The object returned by URLopener().open(file) will differ per
|
|
||||||
protocol. All you know is that is has methods read(), readline(),
|
|
||||||
readlines(), fileno(), close() and info(). The read*(), fileno()
|
|
||||||
and close() methods work like those of open files.
|
|
||||||
The info() method returns a mimetools.Message object which can be
|
|
||||||
used to query various info about the object, if available.
|
|
||||||
(mimetools.Message objects are queried with the getheader() method.)
|
|
||||||
"""
|
|
||||||
|
|
||||||
import string
|
|
||||||
from eventlet.green import socket
|
from eventlet.green import socket
|
||||||
import os
|
import os
|
||||||
from eventlet.green import time
|
import time
|
||||||
import sys
|
import sys
|
||||||
from urlparse import urljoin as basejoin
|
from urlparse import urljoin as basejoin
|
||||||
|
|
||||||
__all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
|
|
||||||
"urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
|
|
||||||
"urlencode", "url2pathname", "pathname2url", "splittag",
|
|
||||||
"localhost", "thishost", "ftperrors", "basejoin", "unwrap",
|
|
||||||
"splittype", "splithost", "splituser", "splitpasswd", "splitport",
|
|
||||||
"splitnport", "splitquery", "splitattr", "splitvalue",
|
|
||||||
"splitgophertype", "getproxies"]
|
|
||||||
|
|
||||||
__version__ = '1.17' # XXX This version is not always updated :-(
|
|
||||||
|
|
||||||
MAXFTPCACHE = 10 # Trim the ftp cache beyond this size
|
|
||||||
|
|
||||||
# Helper for non-unix systems
|
|
||||||
if os.name == 'mac':
|
|
||||||
from macurl2path import url2pathname, pathname2url
|
|
||||||
elif os.name == 'nt':
|
|
||||||
from nturl2path import url2pathname, pathname2url
|
|
||||||
elif os.name == 'riscos':
|
|
||||||
from rourl2path import url2pathname, pathname2url
|
|
||||||
else:
|
|
||||||
def url2pathname(pathname):
|
|
||||||
"""OS-specific conversion from a relative URL of the 'file' scheme
|
|
||||||
to a file system path; not recommended for general use."""
|
|
||||||
return unquote(pathname)
|
|
||||||
|
|
||||||
def pathname2url(pathname):
|
|
||||||
"""OS-specific conversion from a file system path to a relative URL
|
|
||||||
of the 'file' scheme; not recommended for general use."""
|
|
||||||
return quote(pathname)
|
|
||||||
|
|
||||||
# This really consists of two pieces:
|
|
||||||
# (1) a class which handles opening of all sorts of URLs
|
|
||||||
# (plus assorted utilities etc.)
|
|
||||||
# (2) a set of functions for parsing URLs
|
|
||||||
# XXX Should these be separated out into different modules?
|
|
||||||
|
|
||||||
|
|
||||||
# Shortcut for basic usage
|
# Shortcut for basic usage
|
||||||
_urlopener = None
|
_urlopener = None
|
||||||
def urlopen(url, data=None, proxies=None):
|
def urlopen(url, data=None, proxies=None):
|
||||||
@@ -91,181 +39,7 @@ def urlcleanup():
|
|||||||
if _urlopener:
|
if _urlopener:
|
||||||
_urlopener.cleanup()
|
_urlopener.cleanup()
|
||||||
|
|
||||||
# exception raised when downloaded size does not match content-length
|
class URLopener(urllib.URLopener):
|
||||||
class ContentTooShortError(IOError):
|
|
||||||
def __init__(self, message, content):
|
|
||||||
IOError.__init__(self, message)
|
|
||||||
self.content = content
|
|
||||||
|
|
||||||
ftpcache = {}
|
|
||||||
class URLopener:
|
|
||||||
"""Class to open URLs.
|
|
||||||
This is a class rather than just a subroutine because we may need
|
|
||||||
more than one set of global protocol-specific options.
|
|
||||||
Note -- this is a base class for those who don't want the
|
|
||||||
automatic handling of errors type 302 (relocated) and 401
|
|
||||||
(authorization needed)."""
|
|
||||||
|
|
||||||
__tempfiles = None
|
|
||||||
|
|
||||||
version = "Python-urllib/%s" % __version__
|
|
||||||
|
|
||||||
# Constructor
|
|
||||||
def __init__(self, proxies=None, **x509):
|
|
||||||
if proxies is None:
|
|
||||||
proxies = getproxies()
|
|
||||||
assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
|
|
||||||
self.proxies = proxies
|
|
||||||
self.key_file = x509.get('key_file')
|
|
||||||
self.cert_file = x509.get('cert_file')
|
|
||||||
self.addheaders = [('User-Agent', self.version)]
|
|
||||||
self.__tempfiles = []
|
|
||||||
self.__unlink = os.unlink # See cleanup()
|
|
||||||
self.tempcache = None
|
|
||||||
# Undocumented feature: if you assign {} to tempcache,
|
|
||||||
# it is used to cache files retrieved with
|
|
||||||
# self.retrieve(). This is not enabled by default
|
|
||||||
# since it does not work for changing documents (and I
|
|
||||||
# haven't got the logic to check expiration headers
|
|
||||||
# yet).
|
|
||||||
self.ftpcache = ftpcache
|
|
||||||
# Undocumented feature: you can use a different
|
|
||||||
# ftp cache by assigning to the .ftpcache member;
|
|
||||||
# in case you want logically independent URL openers
|
|
||||||
# XXX This is not threadsafe. Bah.
|
|
||||||
|
|
||||||
def __del__(self):
|
|
||||||
self.close()
|
|
||||||
|
|
||||||
def close(self):
|
|
||||||
self.cleanup()
|
|
||||||
|
|
||||||
def cleanup(self):
|
|
||||||
# This code sometimes runs when the rest of this module
|
|
||||||
# has already been deleted, so it can't use any globals
|
|
||||||
# or import anything.
|
|
||||||
if self.__tempfiles:
|
|
||||||
for file in self.__tempfiles:
|
|
||||||
try:
|
|
||||||
self.__unlink(file)
|
|
||||||
except OSError:
|
|
||||||
pass
|
|
||||||
del self.__tempfiles[:]
|
|
||||||
if self.tempcache:
|
|
||||||
self.tempcache.clear()
|
|
||||||
|
|
||||||
def addheader(self, *args):
|
|
||||||
"""Add a header to be used by the HTTP interface only
|
|
||||||
e.g. u.addheader('Accept', 'sound/basic')"""
|
|
||||||
self.addheaders.append(args)
|
|
||||||
|
|
||||||
# External interface
|
|
||||||
def open(self, fullurl, data=None):
|
|
||||||
"""Use URLopener().open(file) instead of open(file, 'r')."""
|
|
||||||
fullurl = unwrap(toBytes(fullurl))
|
|
||||||
if self.tempcache and fullurl in self.tempcache:
|
|
||||||
filename, headers = self.tempcache[fullurl]
|
|
||||||
fp = open(filename, 'rb')
|
|
||||||
return addinfourl(fp, headers, fullurl)
|
|
||||||
urltype, url = splittype(fullurl)
|
|
||||||
if not urltype:
|
|
||||||
urltype = 'file'
|
|
||||||
if urltype in self.proxies:
|
|
||||||
proxy = self.proxies[urltype]
|
|
||||||
urltype, proxyhost = splittype(proxy)
|
|
||||||
host, selector = splithost(proxyhost)
|
|
||||||
url = (host, fullurl) # Signal special case to open_*()
|
|
||||||
else:
|
|
||||||
proxy = None
|
|
||||||
name = 'open_' + urltype
|
|
||||||
self.type = urltype
|
|
||||||
name = name.replace('-', '_')
|
|
||||||
if not hasattr(self, name):
|
|
||||||
if proxy:
|
|
||||||
return self.open_unknown_proxy(proxy, fullurl, data)
|
|
||||||
else:
|
|
||||||
return self.open_unknown(fullurl, data)
|
|
||||||
try:
|
|
||||||
if data is None:
|
|
||||||
return getattr(self, name)(url)
|
|
||||||
else:
|
|
||||||
return getattr(self, name)(url, data)
|
|
||||||
except socket.error, msg:
|
|
||||||
raise IOError, ('socket error', msg), sys.exc_info()[2]
|
|
||||||
|
|
||||||
def open_unknown(self, fullurl, data=None):
|
|
||||||
"""Overridable interface to open unknown URL type."""
|
|
||||||
type, url = splittype(fullurl)
|
|
||||||
raise IOError, ('url error', 'unknown url type', type)
|
|
||||||
|
|
||||||
def open_unknown_proxy(self, proxy, fullurl, data=None):
|
|
||||||
"""Overridable interface to open unknown URL type."""
|
|
||||||
type, url = splittype(fullurl)
|
|
||||||
raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)
|
|
||||||
|
|
||||||
# External interface
|
|
||||||
def retrieve(self, url, filename=None, reporthook=None, data=None):
|
|
||||||
"""retrieve(url) returns (filename, headers) for a local object
|
|
||||||
or (tempfilename, headers) for a remote object."""
|
|
||||||
url = unwrap(toBytes(url))
|
|
||||||
if self.tempcache and url in self.tempcache:
|
|
||||||
return self.tempcache[url]
|
|
||||||
type, url1 = splittype(url)
|
|
||||||
if filename is None and (not type or type == 'file'):
|
|
||||||
try:
|
|
||||||
fp = self.open_local_file(url1)
|
|
||||||
hdrs = fp.info()
|
|
||||||
del fp
|
|
||||||
return url2pathname(splithost(url1)[1]), hdrs
|
|
||||||
except IOError, msg:
|
|
||||||
pass
|
|
||||||
fp = self.open(url, data)
|
|
||||||
headers = fp.info()
|
|
||||||
if filename:
|
|
||||||
tfp = open(filename, 'wb')
|
|
||||||
else:
|
|
||||||
import tempfile
|
|
||||||
garbage, path = splittype(url)
|
|
||||||
garbage, path = splithost(path or "")
|
|
||||||
path, garbage = splitquery(path or "")
|
|
||||||
path, garbage = splitattr(path or "")
|
|
||||||
suffix = os.path.splitext(path)[1]
|
|
||||||
(fd, filename) = tempfile.mkstemp(suffix)
|
|
||||||
self.__tempfiles.append(filename)
|
|
||||||
tfp = os.fdopen(fd, 'wb')
|
|
||||||
result = filename, headers
|
|
||||||
if self.tempcache is not None:
|
|
||||||
self.tempcache[url] = result
|
|
||||||
bs = 1024*8
|
|
||||||
size = -1
|
|
||||||
read = 0
|
|
||||||
blocknum = 0
|
|
||||||
if reporthook:
|
|
||||||
if "content-length" in headers:
|
|
||||||
size = int(headers["Content-Length"])
|
|
||||||
reporthook(blocknum, bs, size)
|
|
||||||
while 1:
|
|
||||||
block = fp.read(bs)
|
|
||||||
if block == "":
|
|
||||||
break
|
|
||||||
read += len(block)
|
|
||||||
tfp.write(block)
|
|
||||||
blocknum += 1
|
|
||||||
if reporthook:
|
|
||||||
reporthook(blocknum, bs, size)
|
|
||||||
fp.close()
|
|
||||||
tfp.close()
|
|
||||||
del fp
|
|
||||||
del tfp
|
|
||||||
|
|
||||||
# raise exception if actual size does not match content-length header
|
|
||||||
if size >= 0 and read < size:
|
|
||||||
raise ContentTooShortError("retrieval incomplete: got only %i out "
|
|
||||||
"of %i bytes" % (read, size), result)
|
|
||||||
|
|
||||||
return result
|
|
||||||
|
|
||||||
# Each method named open_<type> knows how to open that type of URL
|
|
||||||
|
|
||||||
def open_http(self, url, data=None):
|
def open_http(self, url, data=None):
|
||||||
"""Use HTTP protocol."""
|
"""Use HTTP protocol."""
|
||||||
@@ -339,27 +113,6 @@ class URLopener:
|
|||||||
else:
|
else:
|
||||||
return self.http_error(url, fp, errcode, errmsg, headers, data)
|
return self.http_error(url, fp, errcode, errmsg, headers, data)
|
||||||
|
|
||||||
def http_error(self, url, fp, errcode, errmsg, headers, data=None):
|
|
||||||
"""Handle http errors.
|
|
||||||
Derived class can override this, or provide specific handlers
|
|
||||||
named http_error_DDD where DDD is the 3-digit error code."""
|
|
||||||
# First check if there's a specific handler for this error
|
|
||||||
name = 'http_error_%d' % errcode
|
|
||||||
if hasattr(self, name):
|
|
||||||
method = getattr(self, name)
|
|
||||||
if data is None:
|
|
||||||
result = method(url, fp, errcode, errmsg, headers)
|
|
||||||
else:
|
|
||||||
result = method(url, fp, errcode, errmsg, headers, data)
|
|
||||||
if result: return result
|
|
||||||
return self.http_error_default(url, fp, errcode, errmsg, headers)
|
|
||||||
|
|
||||||
def http_error_default(self, url, fp, errcode, errmsg, headers):
|
|
||||||
"""Default error handler: close the connection and raise IOError."""
|
|
||||||
void = fp.read()
|
|
||||||
fp.close()
|
|
||||||
raise IOError, ('http error', errcode, errmsg, headers)
|
|
||||||
|
|
||||||
if hasattr(socket, "ssl"):
|
if hasattr(socket, "ssl"):
|
||||||
def open_https(self, url, data=None):
|
def open_https(self, url, data=None):
|
||||||
"""Use HTTPS protocol."""
|
"""Use HTTPS protocol."""
|
||||||
@@ -449,15 +202,6 @@ class URLopener:
|
|||||||
fp = gopherlib.send_selector(selector, host)
|
fp = gopherlib.send_selector(selector, host)
|
||||||
return addinfourl(fp, noheaders(), "gopher:" + url)
|
return addinfourl(fp, noheaders(), "gopher:" + url)
|
||||||
|
|
||||||
def open_file(self, url):
|
|
||||||
"""Use local file or FTP depending on form of URL."""
|
|
||||||
if not isinstance(url, str):
|
|
||||||
raise IOError, ('file error', 'proxy support for file protocol currently not implemented')
|
|
||||||
if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
|
|
||||||
return self.open_ftp(url)
|
|
||||||
else:
|
|
||||||
return self.open_local_file(url)
|
|
||||||
|
|
||||||
def open_local_file(self, url):
|
def open_local_file(self, url):
|
||||||
"""Use local file."""
|
"""Use local file."""
|
||||||
import mimetypes, mimetools, email.Utils
|
import mimetypes, mimetools, email.Utils
|
||||||
@@ -555,53 +299,7 @@ class URLopener:
|
|||||||
except ftperrors(), msg:
|
except ftperrors(), msg:
|
||||||
raise IOError, ('ftp error', msg), sys.exc_info()[2]
|
raise IOError, ('ftp error', msg), sys.exc_info()[2]
|
||||||
|
|
||||||
def open_data(self, url, data=None):
|
# this one is copied verbatim
|
||||||
"""Use "data" URL."""
|
|
||||||
if not isinstance(url, str):
|
|
||||||
raise IOError, ('data error', 'proxy support for data protocol currently not implemented')
|
|
||||||
# ignore POSTed data
|
|
||||||
#
|
|
||||||
# syntax of data URLs:
|
|
||||||
# dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
|
|
||||||
# mediatype := [ type "/" subtype ] *( ";" parameter )
|
|
||||||
# data := *urlchar
|
|
||||||
# parameter := attribute "=" value
|
|
||||||
import mimetools
|
|
||||||
try:
|
|
||||||
from cStringIO import StringIO
|
|
||||||
except ImportError:
|
|
||||||
from StringIO import StringIO
|
|
||||||
try:
|
|
||||||
[type, data] = url.split(',', 1)
|
|
||||||
except ValueError:
|
|
||||||
raise IOError, ('data error', 'bad data URL')
|
|
||||||
if not type:
|
|
||||||
type = 'text/plain;charset=US-ASCII'
|
|
||||||
semi = type.rfind(';')
|
|
||||||
if semi >= 0 and '=' not in type[semi:]:
|
|
||||||
encoding = type[semi+1:]
|
|
||||||
type = type[:semi]
|
|
||||||
else:
|
|
||||||
encoding = ''
|
|
||||||
msg = []
|
|
||||||
msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
|
|
||||||
time.gmtime(time.time())))
|
|
||||||
msg.append('Content-type: %s' % type)
|
|
||||||
if encoding == 'base64':
|
|
||||||
import base64
|
|
||||||
data = base64.decodestring(data)
|
|
||||||
else:
|
|
||||||
data = unquote(data)
|
|
||||||
msg.append('Content-Length: %d' % len(data))
|
|
||||||
msg.append('')
|
|
||||||
msg.append(data)
|
|
||||||
msg = '\n'.join(msg)
|
|
||||||
f = StringIO(msg)
|
|
||||||
headers = mimetools.Message(f, 0)
|
|
||||||
#f.fileno = None # needed for addinfourl
|
|
||||||
return addinfourl(f, headers, url)
|
|
||||||
|
|
||||||
|
|
||||||
class FancyURLopener(URLopener):
|
class FancyURLopener(URLopener):
|
||||||
"""Derived class with handlers for errors we can handle (perhaps)."""
|
"""Derived class with handlers for errors we can handle (perhaps)."""
|
||||||
|
|
||||||
@@ -815,34 +513,12 @@ def ftperrors():
|
|||||||
_ftperrors = ftplib.all_errors
|
_ftperrors = ftplib.all_errors
|
||||||
return _ftperrors
|
return _ftperrors
|
||||||
|
|
||||||
_noheaders = None
|
|
||||||
def noheaders():
|
|
||||||
"""Return an empty mimetools.Message object."""
|
|
||||||
global _noheaders
|
|
||||||
if _noheaders is None:
|
|
||||||
import mimetools
|
|
||||||
try:
|
|
||||||
from cStringIO import StringIO
|
|
||||||
except ImportError:
|
|
||||||
from StringIO import StringIO
|
|
||||||
_noheaders = mimetools.Message(StringIO(), 0)
|
|
||||||
_noheaders.fp.close() # Recycle file descriptor
|
|
||||||
return _noheaders
|
|
||||||
|
|
||||||
|
|
||||||
# Utility classes
|
# Utility classes
|
||||||
|
|
||||||
class ftpwrapper:
|
class ftpwrapper(urllib.ftpwrapper):
|
||||||
"""Class used by open_ftp() for cache of open FTP connections."""
|
"""Class used by open_ftp() for cache of open FTP connections."""
|
||||||
|
|
||||||
def __init__(self, user, passwd, host, port, dirs):
|
|
||||||
self.user = user
|
|
||||||
self.passwd = passwd
|
|
||||||
self.host = host
|
|
||||||
self.port = port
|
|
||||||
self.dirs = dirs
|
|
||||||
self.init()
|
|
||||||
|
|
||||||
def init(self):
|
def init(self):
|
||||||
from eventlet.green import ftplib
|
from eventlet.green import ftplib
|
||||||
self.busy = 0
|
self.busy = 0
|
||||||
@@ -882,573 +558,6 @@ class ftpwrapper:
|
|||||||
# Pass back both a suitably decorated object and a retrieval length
|
# Pass back both a suitably decorated object and a retrieval length
|
||||||
return (addclosehook(conn[0].makefile('rb'),
|
return (addclosehook(conn[0].makefile('rb'),
|
||||||
self.endtransfer), conn[1])
|
self.endtransfer), conn[1])
|
||||||
def endtransfer(self):
|
|
||||||
if not self.busy:
|
|
||||||
return
|
|
||||||
self.busy = 0
|
|
||||||
try:
|
|
||||||
self.ftp.voidresp()
|
|
||||||
except ftperrors():
|
|
||||||
pass
|
|
||||||
|
|
||||||
def close(self):
|
|
||||||
self.endtransfer()
|
|
||||||
try:
|
|
||||||
self.ftp.close()
|
|
||||||
except ftperrors():
|
|
||||||
pass
|
|
||||||
|
|
||||||
class addbase:
|
|
||||||
"""Base class for addinfo and addclosehook."""
|
|
||||||
|
|
||||||
def __init__(self, fp):
|
|
||||||
self.fp = fp
|
|
||||||
self.read = self.fp.read
|
|
||||||
self.readline = self.fp.readline
|
|
||||||
if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
|
|
||||||
if hasattr(self.fp, "fileno"):
|
|
||||||
self.fileno = self.fp.fileno
|
|
||||||
else:
|
|
||||||
self.fileno = lambda: None
|
|
||||||
if hasattr(self.fp, "__iter__"):
|
|
||||||
self.__iter__ = self.fp.__iter__
|
|
||||||
if hasattr(self.fp, "next"):
|
|
||||||
self.next = self.fp.next
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return '<%s at %r whose fp = %r>' % (self.__class__.__name__,
|
|
||||||
id(self), self.fp)
|
|
||||||
|
|
||||||
def close(self):
|
|
||||||
self.read = None
|
|
||||||
self.readline = None
|
|
||||||
self.readlines = None
|
|
||||||
self.fileno = None
|
|
||||||
if self.fp: self.fp.close()
|
|
||||||
self.fp = None
|
|
||||||
|
|
||||||
class addclosehook(addbase):
|
|
||||||
"""Class to add a close hook to an open file."""
|
|
||||||
|
|
||||||
def __init__(self, fp, closehook, *hookargs):
|
|
||||||
addbase.__init__(self, fp)
|
|
||||||
self.closehook = closehook
|
|
||||||
self.hookargs = hookargs
|
|
||||||
|
|
||||||
def close(self):
|
|
||||||
addbase.close(self)
|
|
||||||
if self.closehook:
|
|
||||||
self.closehook(*self.hookargs)
|
|
||||||
self.closehook = None
|
|
||||||
self.hookargs = None
|
|
||||||
|
|
||||||
class addinfo(addbase):
|
|
||||||
"""class to add an info() method to an open file."""
|
|
||||||
|
|
||||||
def __init__(self, fp, headers):
|
|
||||||
addbase.__init__(self, fp)
|
|
||||||
self.headers = headers
|
|
||||||
|
|
||||||
def info(self):
|
|
||||||
return self.headers
|
|
||||||
|
|
||||||
class addinfourl(addbase):
|
|
||||||
"""class to add info() and geturl() methods to an open file."""
|
|
||||||
|
|
||||||
def __init__(self, fp, headers, url):
|
|
||||||
addbase.__init__(self, fp)
|
|
||||||
self.headers = headers
|
|
||||||
self.url = url
|
|
||||||
|
|
||||||
def info(self):
|
|
||||||
return self.headers
|
|
||||||
|
|
||||||
def geturl(self):
|
|
||||||
return self.url
|
|
||||||
|
|
||||||
|
|
||||||
# Utilities to parse URLs (most of these return None for missing parts):
|
|
||||||
# unwrap('<URL:type://host/path>') --> 'type://host/path'
|
|
||||||
# splittype('type:opaquestring') --> 'type', 'opaquestring'
|
|
||||||
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
|
|
||||||
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
|
|
||||||
# splitpasswd('user:passwd') -> 'user', 'passwd'
|
|
||||||
# splitport('host:port') --> 'host', 'port'
|
|
||||||
# splitquery('/path?query') --> '/path', 'query'
|
|
||||||
# splittag('/path#tag') --> '/path', 'tag'
|
|
||||||
# splitattr('/path;attr1=value1;attr2=value2;...') ->
|
|
||||||
# '/path', ['attr1=value1', 'attr2=value2', ...]
|
|
||||||
# splitvalue('attr=value') --> 'attr', 'value'
|
|
||||||
# splitgophertype('/Xselector') --> 'X', 'selector'
|
|
||||||
# unquote('abc%20def') -> 'abc def'
|
|
||||||
# quote('abc def') -> 'abc%20def')
|
|
||||||
|
|
||||||
try:
|
|
||||||
unicode
|
|
||||||
except NameError:
|
|
||||||
def _is_unicode(x):
|
|
||||||
return 0
|
|
||||||
else:
|
|
||||||
def _is_unicode(x):
|
|
||||||
return isinstance(x, unicode)
|
|
||||||
|
|
||||||
def toBytes(url):
|
|
||||||
"""toBytes(u"URL") --> 'URL'."""
|
|
||||||
# Most URL schemes require ASCII. If that changes, the conversion
|
|
||||||
# can be relaxed
|
|
||||||
if _is_unicode(url):
|
|
||||||
try:
|
|
||||||
url = url.encode("ASCII")
|
|
||||||
except UnicodeError:
|
|
||||||
raise UnicodeError("URL " + repr(url) +
|
|
||||||
" contains non-ASCII characters")
|
|
||||||
return url
|
|
||||||
|
|
||||||
def unwrap(url):
|
|
||||||
"""unwrap('<URL:type://host/path>') --> 'type://host/path'."""
|
|
||||||
url = url.strip()
|
|
||||||
if url[:1] == '<' and url[-1:] == '>':
|
|
||||||
url = url[1:-1].strip()
|
|
||||||
if url[:4] == 'URL:': url = url[4:].strip()
|
|
||||||
return url
|
|
||||||
|
|
||||||
_typeprog = None
|
|
||||||
def splittype(url):
|
|
||||||
"""splittype('type:opaquestring') --> 'type', 'opaquestring'."""
|
|
||||||
global _typeprog
|
|
||||||
if _typeprog is None:
|
|
||||||
import re
|
|
||||||
_typeprog = re.compile('^([^/:]+):')
|
|
||||||
|
|
||||||
match = _typeprog.match(url)
|
|
||||||
if match:
|
|
||||||
scheme = match.group(1)
|
|
||||||
return scheme.lower(), url[len(scheme) + 1:]
|
|
||||||
return None, url
|
|
||||||
|
|
||||||
_hostprog = None
|
|
||||||
def splithost(url):
|
|
||||||
"""splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
|
|
||||||
global _hostprog
|
|
||||||
if _hostprog is None:
|
|
||||||
import re
|
|
||||||
_hostprog = re.compile('^//([^/?]*)(.*)$')
|
|
||||||
|
|
||||||
match = _hostprog.match(url)
|
|
||||||
if match: return match.group(1, 2)
|
|
||||||
return None, url
|
|
||||||
|
|
||||||
_userprog = None
|
|
||||||
def splituser(host):
|
|
||||||
"""splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
|
|
||||||
global _userprog
|
|
||||||
if _userprog is None:
|
|
||||||
import re
|
|
||||||
_userprog = re.compile('^(.*)@(.*)$')
|
|
||||||
|
|
||||||
match = _userprog.match(host)
|
|
||||||
if match: return map(unquote, match.group(1, 2))
|
|
||||||
return None, host
|
|
||||||
|
|
||||||
_passwdprog = None
|
|
||||||
def splitpasswd(user):
|
|
||||||
"""splitpasswd('user:passwd') -> 'user', 'passwd'."""
|
|
||||||
global _passwdprog
|
|
||||||
if _passwdprog is None:
|
|
||||||
import re
|
|
||||||
_passwdprog = re.compile('^([^:]*):(.*)$')
|
|
||||||
|
|
||||||
match = _passwdprog.match(user)
|
|
||||||
if match: return match.group(1, 2)
|
|
||||||
return user, None
|
|
||||||
|
|
||||||
# splittag('/path#tag') --> '/path', 'tag'
|
|
||||||
_portprog = None
|
|
||||||
def splitport(host):
|
|
||||||
"""splitport('host:port') --> 'host', 'port'."""
|
|
||||||
global _portprog
|
|
||||||
if _portprog is None:
|
|
||||||
import re
|
|
||||||
_portprog = re.compile('^(.*):([0-9]+)$')
|
|
||||||
|
|
||||||
match = _portprog.match(host)
|
|
||||||
if match: return match.group(1, 2)
|
|
||||||
return host, None
|
|
||||||
|
|
||||||
_nportprog = None
|
|
||||||
def splitnport(host, defport=-1):
|
|
||||||
"""Split host and port, returning numeric port.
|
|
||||||
Return given default port if no ':' found; defaults to -1.
|
|
||||||
Return numerical port if a valid number are found after ':'.
|
|
||||||
Return None if ':' but not a valid number."""
|
|
||||||
global _nportprog
|
|
||||||
if _nportprog is None:
|
|
||||||
import re
|
|
||||||
_nportprog = re.compile('^(.*):(.*)$')
|
|
||||||
|
|
||||||
match = _nportprog.match(host)
|
|
||||||
if match:
|
|
||||||
host, port = match.group(1, 2)
|
|
||||||
try:
|
|
||||||
if not port: raise ValueError, "no digits"
|
|
||||||
nport = int(port)
|
|
||||||
except ValueError:
|
|
||||||
nport = None
|
|
||||||
return host, nport
|
|
||||||
return host, defport
|
|
||||||
|
|
||||||
_queryprog = None
|
|
||||||
def splitquery(url):
|
|
||||||
"""splitquery('/path?query') --> '/path', 'query'."""
|
|
||||||
global _queryprog
|
|
||||||
if _queryprog is None:
|
|
||||||
import re
|
|
||||||
_queryprog = re.compile('^(.*)\?([^?]*)$')
|
|
||||||
|
|
||||||
match = _queryprog.match(url)
|
|
||||||
if match: return match.group(1, 2)
|
|
||||||
return url, None
|
|
||||||
|
|
||||||
_tagprog = None
|
|
||||||
def splittag(url):
|
|
||||||
"""splittag('/path#tag') --> '/path', 'tag'."""
|
|
||||||
global _tagprog
|
|
||||||
if _tagprog is None:
|
|
||||||
import re
|
|
||||||
_tagprog = re.compile('^(.*)#([^#]*)$')
|
|
||||||
|
|
||||||
match = _tagprog.match(url)
|
|
||||||
if match: return match.group(1, 2)
|
|
||||||
return url, None
|
|
||||||
|
|
||||||
def splitattr(url):
|
|
||||||
"""splitattr('/path;attr1=value1;attr2=value2;...') ->
|
|
||||||
'/path', ['attr1=value1', 'attr2=value2', ...]."""
|
|
||||||
words = url.split(';')
|
|
||||||
return words[0], words[1:]
|
|
||||||
|
|
||||||
_valueprog = None
|
|
||||||
def splitvalue(attr):
|
|
||||||
"""splitvalue('attr=value') --> 'attr', 'value'."""
|
|
||||||
global _valueprog
|
|
||||||
if _valueprog is None:
|
|
||||||
import re
|
|
||||||
_valueprog = re.compile('^([^=]*)=(.*)$')
|
|
||||||
|
|
||||||
match = _valueprog.match(attr)
|
|
||||||
if match: return match.group(1, 2)
|
|
||||||
return attr, None
|
|
||||||
|
|
||||||
def splitgophertype(selector):
|
|
||||||
"""splitgophertype('/Xselector') --> 'X', 'selector'."""
|
|
||||||
if selector[:1] == '/' and selector[1:2]:
|
|
||||||
return selector[1], selector[2:]
|
|
||||||
return None, selector
|
|
||||||
|
|
||||||
_hextochr = dict(('%02x' % i, chr(i)) for i in range(256))
|
|
||||||
_hextochr.update(('%02X' % i, chr(i)) for i in range(256))
|
|
||||||
|
|
||||||
def unquote(s):
|
|
||||||
"""unquote('abc%20def') -> 'abc def'."""
|
|
||||||
res = s.split('%')
|
|
||||||
for i in xrange(1, len(res)):
|
|
||||||
item = res[i]
|
|
||||||
try:
|
|
||||||
res[i] = _hextochr[item[:2]] + item[2:]
|
|
||||||
except KeyError:
|
|
||||||
res[i] = '%' + item
|
|
||||||
except UnicodeDecodeError:
|
|
||||||
res[i] = unichr(int(item[:2], 16)) + item[2:]
|
|
||||||
return "".join(res)
|
|
||||||
|
|
||||||
def unquote_plus(s):
|
|
||||||
"""unquote('%7e/abc+def') -> '~/abc def'"""
|
|
||||||
s = s.replace('+', ' ')
|
|
||||||
return unquote(s)
|
|
||||||
|
|
||||||
always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
|
|
||||||
'abcdefghijklmnopqrstuvwxyz'
|
|
||||||
'0123456789' '_.-')
|
|
||||||
_safemaps = {}
|
|
||||||
|
|
||||||
def quote(s, safe = '/'):
|
|
||||||
"""quote('abc def') -> 'abc%20def'
|
|
||||||
|
|
||||||
Each part of a URL, e.g. the path info, the query, etc., has a
|
|
||||||
different set of reserved characters that must be quoted.
|
|
||||||
|
|
||||||
RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
|
|
||||||
the following reserved characters.
|
|
||||||
|
|
||||||
reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
|
|
||||||
"$" | ","
|
|
||||||
|
|
||||||
Each of these characters is reserved in some component of a URL,
|
|
||||||
but not necessarily in all of them.
|
|
||||||
|
|
||||||
By default, the quote function is intended for quoting the path
|
|
||||||
section of a URL. Thus, it will not encode '/'. This character
|
|
||||||
is reserved, but in typical usage the quote function is being
|
|
||||||
called on a path where the existing slash characters are used as
|
|
||||||
reserved characters.
|
|
||||||
"""
|
|
||||||
cachekey = (safe, always_safe)
|
|
||||||
try:
|
|
||||||
safe_map = _safemaps[cachekey]
|
|
||||||
except KeyError:
|
|
||||||
safe += always_safe
|
|
||||||
safe_map = {}
|
|
||||||
for i in range(256):
|
|
||||||
c = chr(i)
|
|
||||||
safe_map[c] = (c in safe) and c or ('%%%02X' % i)
|
|
||||||
_safemaps[cachekey] = safe_map
|
|
||||||
res = map(safe_map.__getitem__, s)
|
|
||||||
return ''.join(res)
|
|
||||||
|
|
||||||
def quote_plus(s, safe = ''):
|
|
||||||
"""Quote the query fragment of a URL; replacing ' ' with '+'"""
|
|
||||||
if ' ' in s:
|
|
||||||
s = quote(s, safe + ' ')
|
|
||||||
return s.replace(' ', '+')
|
|
||||||
return quote(s, safe)
|
|
||||||
|
|
||||||
def urlencode(query,doseq=0):
|
|
||||||
"""Encode a sequence of two-element tuples or dictionary into a URL query string.
|
|
||||||
|
|
||||||
If any values in the query arg are sequences and doseq is true, each
|
|
||||||
sequence element is converted to a separate parameter.
|
|
||||||
|
|
||||||
If the query arg is a sequence of two-element tuples, the order of the
|
|
||||||
parameters in the output will match the order of parameters in the
|
|
||||||
input.
|
|
||||||
"""
|
|
||||||
|
|
||||||
if hasattr(query,"items"):
|
|
||||||
# mapping objects
|
|
||||||
query = query.items()
|
|
||||||
else:
|
|
||||||
# it's a bother at times that strings and string-like objects are
|
|
||||||
# sequences...
|
|
||||||
try:
|
|
||||||
# non-sequence items should not work with len()
|
|
||||||
# non-empty strings will fail this
|
|
||||||
if len(query) and not isinstance(query[0], tuple):
|
|
||||||
raise TypeError
|
|
||||||
# zero-length sequences of all types will get here and succeed,
|
|
||||||
# but that's a minor nit - since the original implementation
|
|
||||||
# allowed empty dicts that type of behavior probably should be
|
|
||||||
# preserved for consistency
|
|
||||||
except TypeError:
|
|
||||||
ty,va,tb = sys.exc_info()
|
|
||||||
raise TypeError, "not a valid non-string sequence or mapping object", tb
|
|
||||||
|
|
||||||
l = []
|
|
||||||
if not doseq:
|
|
||||||
# preserve old behavior
|
|
||||||
for k, v in query:
|
|
||||||
k = quote_plus(str(k))
|
|
||||||
v = quote_plus(str(v))
|
|
||||||
l.append(k + '=' + v)
|
|
||||||
else:
|
|
||||||
for k, v in query:
|
|
||||||
k = quote_plus(str(k))
|
|
||||||
if isinstance(v, str):
|
|
||||||
v = quote_plus(v)
|
|
||||||
l.append(k + '=' + v)
|
|
||||||
elif _is_unicode(v):
|
|
||||||
# is there a reasonable way to convert to ASCII?
|
|
||||||
# encode generates a string, but "replace" or "ignore"
|
|
||||||
# lose information and "strict" can raise UnicodeError
|
|
||||||
v = quote_plus(v.encode("ASCII","replace"))
|
|
||||||
l.append(k + '=' + v)
|
|
||||||
else:
|
|
||||||
try:
|
|
||||||
# is this a sufficient test for sequence-ness?
|
|
||||||
x = len(v)
|
|
||||||
except TypeError:
|
|
||||||
# not a sequence
|
|
||||||
v = quote_plus(str(v))
|
|
||||||
l.append(k + '=' + v)
|
|
||||||
else:
|
|
||||||
# loop over the sequence
|
|
||||||
for elt in v:
|
|
||||||
l.append(k + '=' + quote_plus(str(elt)))
|
|
||||||
return '&'.join(l)
|
|
||||||
|
|
||||||
# Proxy handling
|
|
||||||
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.
    """
    # Variable names are matched case-insensitively (HTTP_PROXY and
    # http_proxy are both honoured); empty values are ignored.
    return dict(
        (name.lower()[:-6], value)
        for name, value in os.environ.items()
        if value and name.lower()[-6:] == '_proxy'
    )
|
|
||||||
|
|
||||||
if sys.platform == 'darwin':

    def getproxies_internetconfig():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store
        proxies.  An HTTP proxy, for instance, is stored under
        the HttpProxy key.
        """
        # Internet Config may be unavailable; report "no proxies" then.
        try:
            import ic
        except ImportError:
            return {}
        try:
            config = ic.IC()
        except ic.error:
            return {}

        result = {}
        # HTTP:
        if 'UseHTTPProxy' in config and config['UseHTTPProxy']:
            try:
                hostport = config['HTTPProxyHost']
            except ic.error:
                pass
            else:
                result['http'] = 'http://%s' % hostport
        # FTP: XXXX To be done.
        # Gopher: XXXX To be done.
        return result

    def proxy_bypass(x):
        # No proxy-bypass support on this platform.
        return 0

    def getproxies():
        # Environment settings take precedence over Internet Config.
        return getproxies_environment() or getproxies_internetconfig()

elif os.name == 'nt':

    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.
        """
        result = {}
        try:
            import _winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return result
        try:
            settings = _winreg.OpenKey(
                _winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            enabled = _winreg.QueryValueEx(settings, 'ProxyEnable')[0]
            if enabled:
                # Returned as Unicode but problems if not converted to ASCII
                server = str(_winreg.QueryValueEx(settings,
                                                  'ProxyServer')[0])
                if '=' in server:
                    # Per-protocol settings, e.g. "http=host:80;ftp=host:21"
                    import re
                    for item in server.split(';'):
                        scheme, address = item.split('=', 1)
                        # See if address has a type:// prefix
                        if not re.match('^([^/:]+)://', address):
                            address = '%s://%s' % (scheme, address)
                        result[scheme] = address
                else:
                    # Use one setting for all protocols
                    if server[:5] == 'http:':
                        result['http'] = server
                    else:
                        result['http'] = 'http://%s' % server
                        result['ftp'] = 'ftp://%s' % server
            settings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format; result is already empty, nothing to do.
            pass
        return result

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.
        """
        return getproxies_environment() or getproxies_registry()

    def proxy_bypass(host):
        """Return 1 if *host* matches the registry's ProxyOverride list."""
        try:
            import _winreg
            import re
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            settings = _winreg.OpenKey(
                _winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            enabled = _winreg.QueryValueEx(settings, 'ProxyEnable')[0]
            override = str(_winreg.QueryValueEx(settings,
                                                'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except WindowsError:
            return 0
        if not enabled or not override:
            return 0
        # try to make a host list from name and IP address.
        rawHost, port = splitport(host)
        candidates = [rawHost]
        try:
            addr = socket.gethostbyname(rawHost)
            if addr != rawHost:
                candidates.append(addr)
        except socket.error:
            pass
        try:
            fqdn = socket.getfqdn(rawHost)
            if fqdn != rawHost:
                candidates.append(fqdn)
        except socket.error:
            pass
        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        override = override.split(';')
        i = 0
        while i < len(override):
            if override[i] == '<local>':
                override[i:i+1] = ['localhost',
                                   '127.0.0.1',
                                   socket.gethostname(),
                                   socket.gethostbyname(
                                       socket.gethostname())]
            i += 1
        # now check if we match one of the registry values.
        for pattern in override:
            # Turn the glob-style registry entry into a regex.
            pattern = pattern.replace(".", r"\.")    # mask dots
            pattern = pattern.replace("*", r".*")    # change glob sequence
            pattern = pattern.replace("?", r".")     # change glob char
            for candidate in candidates:
                if re.match(pattern, candidate, re.I):
                    return 1
        return 0

else:
    # By default use environment variables
    getproxies = getproxies_environment

    def proxy_bypass(host):
        # No platform-specific bypass mechanism available.
        return 0
|
|
||||||
|
|
||||||
# Test and time quote() and unquote()
|
# Test and time quote() and unquote()
|
||||||
def test1():
|
def test1():
|
||||||
|
Reference in New Issue
Block a user