better integration of urllib: only redefine classes that use i/o
This commit is contained in:
@@ -1,71 +1,19 @@
|
||||
"""Open an arbitrary URL.
|
||||
from __future__ import absolute_import
|
||||
import urllib
|
||||
from urllib import *
|
||||
|
||||
See the following document for more info on URLs:
|
||||
"Names and Addresses, URIs, URLs, URNs, URCs", at
|
||||
http://www.w3.org/pub/WWW/Addressing/Overview.html
|
||||
# import the following to be a better drop-in replacement
|
||||
from urllib import (__all__, __version__, MAXFTPCACHE, ContentTooShortError,
|
||||
ftpcache, _noheaders, noheaders, addbase, addclosehook,
|
||||
addinfo, addinfourl, _is_unicode, toBytes, _hextochr,
|
||||
always_safe, getproxies_environment, proxy_bypass)
|
||||
|
||||
See also the HTTP spec (from which the error codes are derived):
|
||||
"HTTP - Hypertext Transfer Protocol", at
|
||||
http://www.w3.org/pub/WWW/Protocols/
|
||||
|
||||
Related standards and specs:
|
||||
- RFC1808: the "relative URL" spec. (authoritative status)
|
||||
- RFC1738 - the "URL standard". (authoritative status)
|
||||
- RFC1630 - the "URI spec". (informational status)
|
||||
|
||||
The object returned by URLopener().open(file) will differ per
|
||||
protocol. All you know is that is has methods read(), readline(),
|
||||
readlines(), fileno(), close() and info(). The read*(), fileno()
|
||||
and close() methods work like those of open files.
|
||||
The info() method returns a mimetools.Message object which can be
|
||||
used to query various info about the object, if available.
|
||||
(mimetools.Message objects are queried with the getheader() method.)
|
||||
"""
|
||||
|
||||
import string
|
||||
from eventlet.green import socket
|
||||
import os
|
||||
from eventlet.green import time
|
||||
import time
|
||||
import sys
|
||||
from urlparse import urljoin as basejoin
|
||||
|
||||
__all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
|
||||
"urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
|
||||
"urlencode", "url2pathname", "pathname2url", "splittag",
|
||||
"localhost", "thishost", "ftperrors", "basejoin", "unwrap",
|
||||
"splittype", "splithost", "splituser", "splitpasswd", "splitport",
|
||||
"splitnport", "splitquery", "splitattr", "splitvalue",
|
||||
"splitgophertype", "getproxies"]
|
||||
|
||||
__version__ = '1.17' # XXX This version is not always updated :-(
|
||||
|
||||
MAXFTPCACHE = 10 # Trim the ftp cache beyond this size
|
||||
|
||||
# Helper for non-unix systems
|
||||
if os.name == 'mac':
|
||||
from macurl2path import url2pathname, pathname2url
|
||||
elif os.name == 'nt':
|
||||
from nturl2path import url2pathname, pathname2url
|
||||
elif os.name == 'riscos':
|
||||
from rourl2path import url2pathname, pathname2url
|
||||
else:
|
||||
def url2pathname(pathname):
|
||||
"""OS-specific conversion from a relative URL of the 'file' scheme
|
||||
to a file system path; not recommended for general use."""
|
||||
return unquote(pathname)
|
||||
|
||||
def pathname2url(pathname):
|
||||
"""OS-specific conversion from a file system path to a relative URL
|
||||
of the 'file' scheme; not recommended for general use."""
|
||||
return quote(pathname)
|
||||
|
||||
# This really consists of two pieces:
|
||||
# (1) a class which handles opening of all sorts of URLs
|
||||
# (plus assorted utilities etc.)
|
||||
# (2) a set of functions for parsing URLs
|
||||
# XXX Should these be separated out into different modules?
|
||||
|
||||
|
||||
# Shortcut for basic usage
|
||||
_urlopener = None
|
||||
def urlopen(url, data=None, proxies=None):
|
||||
@@ -91,181 +39,7 @@ def urlcleanup():
|
||||
if _urlopener:
|
||||
_urlopener.cleanup()
|
||||
|
||||
# exception raised when downloaded size does not match content-length
|
||||
class ContentTooShortError(IOError):
|
||||
def __init__(self, message, content):
|
||||
IOError.__init__(self, message)
|
||||
self.content = content
|
||||
|
||||
ftpcache = {}
|
||||
class URLopener:
|
||||
"""Class to open URLs.
|
||||
This is a class rather than just a subroutine because we may need
|
||||
more than one set of global protocol-specific options.
|
||||
Note -- this is a base class for those who don't want the
|
||||
automatic handling of errors type 302 (relocated) and 401
|
||||
(authorization needed)."""
|
||||
|
||||
__tempfiles = None
|
||||
|
||||
version = "Python-urllib/%s" % __version__
|
||||
|
||||
# Constructor
|
||||
def __init__(self, proxies=None, **x509):
|
||||
if proxies is None:
|
||||
proxies = getproxies()
|
||||
assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
|
||||
self.proxies = proxies
|
||||
self.key_file = x509.get('key_file')
|
||||
self.cert_file = x509.get('cert_file')
|
||||
self.addheaders = [('User-Agent', self.version)]
|
||||
self.__tempfiles = []
|
||||
self.__unlink = os.unlink # See cleanup()
|
||||
self.tempcache = None
|
||||
# Undocumented feature: if you assign {} to tempcache,
|
||||
# it is used to cache files retrieved with
|
||||
# self.retrieve(). This is not enabled by default
|
||||
# since it does not work for changing documents (and I
|
||||
# haven't got the logic to check expiration headers
|
||||
# yet).
|
||||
self.ftpcache = ftpcache
|
||||
# Undocumented feature: you can use a different
|
||||
# ftp cache by assigning to the .ftpcache member;
|
||||
# in case you want logically independent URL openers
|
||||
# XXX This is not threadsafe. Bah.
|
||||
|
||||
def __del__(self):
|
||||
self.close()
|
||||
|
||||
def close(self):
|
||||
self.cleanup()
|
||||
|
||||
def cleanup(self):
|
||||
# This code sometimes runs when the rest of this module
|
||||
# has already been deleted, so it can't use any globals
|
||||
# or import anything.
|
||||
if self.__tempfiles:
|
||||
for file in self.__tempfiles:
|
||||
try:
|
||||
self.__unlink(file)
|
||||
except OSError:
|
||||
pass
|
||||
del self.__tempfiles[:]
|
||||
if self.tempcache:
|
||||
self.tempcache.clear()
|
||||
|
||||
def addheader(self, *args):
|
||||
"""Add a header to be used by the HTTP interface only
|
||||
e.g. u.addheader('Accept', 'sound/basic')"""
|
||||
self.addheaders.append(args)
|
||||
|
||||
# External interface
|
||||
def open(self, fullurl, data=None):
|
||||
"""Use URLopener().open(file) instead of open(file, 'r')."""
|
||||
fullurl = unwrap(toBytes(fullurl))
|
||||
if self.tempcache and fullurl in self.tempcache:
|
||||
filename, headers = self.tempcache[fullurl]
|
||||
fp = open(filename, 'rb')
|
||||
return addinfourl(fp, headers, fullurl)
|
||||
urltype, url = splittype(fullurl)
|
||||
if not urltype:
|
||||
urltype = 'file'
|
||||
if urltype in self.proxies:
|
||||
proxy = self.proxies[urltype]
|
||||
urltype, proxyhost = splittype(proxy)
|
||||
host, selector = splithost(proxyhost)
|
||||
url = (host, fullurl) # Signal special case to open_*()
|
||||
else:
|
||||
proxy = None
|
||||
name = 'open_' + urltype
|
||||
self.type = urltype
|
||||
name = name.replace('-', '_')
|
||||
if not hasattr(self, name):
|
||||
if proxy:
|
||||
return self.open_unknown_proxy(proxy, fullurl, data)
|
||||
else:
|
||||
return self.open_unknown(fullurl, data)
|
||||
try:
|
||||
if data is None:
|
||||
return getattr(self, name)(url)
|
||||
else:
|
||||
return getattr(self, name)(url, data)
|
||||
except socket.error, msg:
|
||||
raise IOError, ('socket error', msg), sys.exc_info()[2]
|
||||
|
||||
def open_unknown(self, fullurl, data=None):
|
||||
"""Overridable interface to open unknown URL type."""
|
||||
type, url = splittype(fullurl)
|
||||
raise IOError, ('url error', 'unknown url type', type)
|
||||
|
||||
def open_unknown_proxy(self, proxy, fullurl, data=None):
|
||||
"""Overridable interface to open unknown URL type."""
|
||||
type, url = splittype(fullurl)
|
||||
raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)
|
||||
|
||||
# External interface
|
||||
def retrieve(self, url, filename=None, reporthook=None, data=None):
|
||||
"""retrieve(url) returns (filename, headers) for a local object
|
||||
or (tempfilename, headers) for a remote object."""
|
||||
url = unwrap(toBytes(url))
|
||||
if self.tempcache and url in self.tempcache:
|
||||
return self.tempcache[url]
|
||||
type, url1 = splittype(url)
|
||||
if filename is None and (not type or type == 'file'):
|
||||
try:
|
||||
fp = self.open_local_file(url1)
|
||||
hdrs = fp.info()
|
||||
del fp
|
||||
return url2pathname(splithost(url1)[1]), hdrs
|
||||
except IOError, msg:
|
||||
pass
|
||||
fp = self.open(url, data)
|
||||
headers = fp.info()
|
||||
if filename:
|
||||
tfp = open(filename, 'wb')
|
||||
else:
|
||||
import tempfile
|
||||
garbage, path = splittype(url)
|
||||
garbage, path = splithost(path or "")
|
||||
path, garbage = splitquery(path or "")
|
||||
path, garbage = splitattr(path or "")
|
||||
suffix = os.path.splitext(path)[1]
|
||||
(fd, filename) = tempfile.mkstemp(suffix)
|
||||
self.__tempfiles.append(filename)
|
||||
tfp = os.fdopen(fd, 'wb')
|
||||
result = filename, headers
|
||||
if self.tempcache is not None:
|
||||
self.tempcache[url] = result
|
||||
bs = 1024*8
|
||||
size = -1
|
||||
read = 0
|
||||
blocknum = 0
|
||||
if reporthook:
|
||||
if "content-length" in headers:
|
||||
size = int(headers["Content-Length"])
|
||||
reporthook(blocknum, bs, size)
|
||||
while 1:
|
||||
block = fp.read(bs)
|
||||
if block == "":
|
||||
break
|
||||
read += len(block)
|
||||
tfp.write(block)
|
||||
blocknum += 1
|
||||
if reporthook:
|
||||
reporthook(blocknum, bs, size)
|
||||
fp.close()
|
||||
tfp.close()
|
||||
del fp
|
||||
del tfp
|
||||
|
||||
# raise exception if actual size does not match content-length header
|
||||
if size >= 0 and read < size:
|
||||
raise ContentTooShortError("retrieval incomplete: got only %i out "
|
||||
"of %i bytes" % (read, size), result)
|
||||
|
||||
return result
|
||||
|
||||
# Each method named open_<type> knows how to open that type of URL
|
||||
class URLopener(urllib.URLopener):
|
||||
|
||||
def open_http(self, url, data=None):
|
||||
"""Use HTTP protocol."""
|
||||
@@ -339,27 +113,6 @@ class URLopener:
|
||||
else:
|
||||
return self.http_error(url, fp, errcode, errmsg, headers, data)
|
||||
|
||||
def http_error(self, url, fp, errcode, errmsg, headers, data=None):
|
||||
"""Handle http errors.
|
||||
Derived class can override this, or provide specific handlers
|
||||
named http_error_DDD where DDD is the 3-digit error code."""
|
||||
# First check if there's a specific handler for this error
|
||||
name = 'http_error_%d' % errcode
|
||||
if hasattr(self, name):
|
||||
method = getattr(self, name)
|
||||
if data is None:
|
||||
result = method(url, fp, errcode, errmsg, headers)
|
||||
else:
|
||||
result = method(url, fp, errcode, errmsg, headers, data)
|
||||
if result: return result
|
||||
return self.http_error_default(url, fp, errcode, errmsg, headers)
|
||||
|
||||
def http_error_default(self, url, fp, errcode, errmsg, headers):
|
||||
"""Default error handler: close the connection and raise IOError."""
|
||||
void = fp.read()
|
||||
fp.close()
|
||||
raise IOError, ('http error', errcode, errmsg, headers)
|
||||
|
||||
if hasattr(socket, "ssl"):
|
||||
def open_https(self, url, data=None):
|
||||
"""Use HTTPS protocol."""
|
||||
@@ -449,15 +202,6 @@ class URLopener:
|
||||
fp = gopherlib.send_selector(selector, host)
|
||||
return addinfourl(fp, noheaders(), "gopher:" + url)
|
||||
|
||||
def open_file(self, url):
|
||||
"""Use local file or FTP depending on form of URL."""
|
||||
if not isinstance(url, str):
|
||||
raise IOError, ('file error', 'proxy support for file protocol currently not implemented')
|
||||
if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
|
||||
return self.open_ftp(url)
|
||||
else:
|
||||
return self.open_local_file(url)
|
||||
|
||||
def open_local_file(self, url):
|
||||
"""Use local file."""
|
||||
import mimetypes, mimetools, email.Utils
|
||||
@@ -555,53 +299,7 @@ class URLopener:
|
||||
except ftperrors(), msg:
|
||||
raise IOError, ('ftp error', msg), sys.exc_info()[2]
|
||||
|
||||
def open_data(self, url, data=None):
|
||||
"""Use "data" URL."""
|
||||
if not isinstance(url, str):
|
||||
raise IOError, ('data error', 'proxy support for data protocol currently not implemented')
|
||||
# ignore POSTed data
|
||||
#
|
||||
# syntax of data URLs:
|
||||
# dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
|
||||
# mediatype := [ type "/" subtype ] *( ";" parameter )
|
||||
# data := *urlchar
|
||||
# parameter := attribute "=" value
|
||||
import mimetools
|
||||
try:
|
||||
from cStringIO import StringIO
|
||||
except ImportError:
|
||||
from StringIO import StringIO
|
||||
try:
|
||||
[type, data] = url.split(',', 1)
|
||||
except ValueError:
|
||||
raise IOError, ('data error', 'bad data URL')
|
||||
if not type:
|
||||
type = 'text/plain;charset=US-ASCII'
|
||||
semi = type.rfind(';')
|
||||
if semi >= 0 and '=' not in type[semi:]:
|
||||
encoding = type[semi+1:]
|
||||
type = type[:semi]
|
||||
else:
|
||||
encoding = ''
|
||||
msg = []
|
||||
msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
|
||||
time.gmtime(time.time())))
|
||||
msg.append('Content-type: %s' % type)
|
||||
if encoding == 'base64':
|
||||
import base64
|
||||
data = base64.decodestring(data)
|
||||
else:
|
||||
data = unquote(data)
|
||||
msg.append('Content-Length: %d' % len(data))
|
||||
msg.append('')
|
||||
msg.append(data)
|
||||
msg = '\n'.join(msg)
|
||||
f = StringIO(msg)
|
||||
headers = mimetools.Message(f, 0)
|
||||
#f.fileno = None # needed for addinfourl
|
||||
return addinfourl(f, headers, url)
|
||||
|
||||
|
||||
# this one is copied verbatim
|
||||
class FancyURLopener(URLopener):
|
||||
"""Derived class with handlers for errors we can handle (perhaps)."""
|
||||
|
||||
@@ -815,34 +513,12 @@ def ftperrors():
|
||||
_ftperrors = ftplib.all_errors
|
||||
return _ftperrors
|
||||
|
||||
_noheaders = None
|
||||
def noheaders():
|
||||
"""Return an empty mimetools.Message object."""
|
||||
global _noheaders
|
||||
if _noheaders is None:
|
||||
import mimetools
|
||||
try:
|
||||
from cStringIO import StringIO
|
||||
except ImportError:
|
||||
from StringIO import StringIO
|
||||
_noheaders = mimetools.Message(StringIO(), 0)
|
||||
_noheaders.fp.close() # Recycle file descriptor
|
||||
return _noheaders
|
||||
|
||||
|
||||
# Utility classes
|
||||
|
||||
class ftpwrapper:
|
||||
class ftpwrapper(urllib.ftpwrapper):
|
||||
"""Class used by open_ftp() for cache of open FTP connections."""
|
||||
|
||||
def __init__(self, user, passwd, host, port, dirs):
|
||||
self.user = user
|
||||
self.passwd = passwd
|
||||
self.host = host
|
||||
self.port = port
|
||||
self.dirs = dirs
|
||||
self.init()
|
||||
|
||||
def init(self):
|
||||
from eventlet.green import ftplib
|
||||
self.busy = 0
|
||||
@@ -882,573 +558,6 @@ class ftpwrapper:
|
||||
# Pass back both a suitably decorated object and a retrieval length
|
||||
return (addclosehook(conn[0].makefile('rb'),
|
||||
self.endtransfer), conn[1])
|
||||
def endtransfer(self):
|
||||
if not self.busy:
|
||||
return
|
||||
self.busy = 0
|
||||
try:
|
||||
self.ftp.voidresp()
|
||||
except ftperrors():
|
||||
pass
|
||||
|
||||
def close(self):
|
||||
self.endtransfer()
|
||||
try:
|
||||
self.ftp.close()
|
||||
except ftperrors():
|
||||
pass
|
||||
|
||||
class addbase:
|
||||
"""Base class for addinfo and addclosehook."""
|
||||
|
||||
def __init__(self, fp):
|
||||
self.fp = fp
|
||||
self.read = self.fp.read
|
||||
self.readline = self.fp.readline
|
||||
if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
|
||||
if hasattr(self.fp, "fileno"):
|
||||
self.fileno = self.fp.fileno
|
||||
else:
|
||||
self.fileno = lambda: None
|
||||
if hasattr(self.fp, "__iter__"):
|
||||
self.__iter__ = self.fp.__iter__
|
||||
if hasattr(self.fp, "next"):
|
||||
self.next = self.fp.next
|
||||
|
||||
def __repr__(self):
|
||||
return '<%s at %r whose fp = %r>' % (self.__class__.__name__,
|
||||
id(self), self.fp)
|
||||
|
||||
def close(self):
|
||||
self.read = None
|
||||
self.readline = None
|
||||
self.readlines = None
|
||||
self.fileno = None
|
||||
if self.fp: self.fp.close()
|
||||
self.fp = None
|
||||
|
||||
class addclosehook(addbase):
|
||||
"""Class to add a close hook to an open file."""
|
||||
|
||||
def __init__(self, fp, closehook, *hookargs):
|
||||
addbase.__init__(self, fp)
|
||||
self.closehook = closehook
|
||||
self.hookargs = hookargs
|
||||
|
||||
def close(self):
|
||||
addbase.close(self)
|
||||
if self.closehook:
|
||||
self.closehook(*self.hookargs)
|
||||
self.closehook = None
|
||||
self.hookargs = None
|
||||
|
||||
class addinfo(addbase):
|
||||
"""class to add an info() method to an open file."""
|
||||
|
||||
def __init__(self, fp, headers):
|
||||
addbase.__init__(self, fp)
|
||||
self.headers = headers
|
||||
|
||||
def info(self):
|
||||
return self.headers
|
||||
|
||||
class addinfourl(addbase):
|
||||
"""class to add info() and geturl() methods to an open file."""
|
||||
|
||||
def __init__(self, fp, headers, url):
|
||||
addbase.__init__(self, fp)
|
||||
self.headers = headers
|
||||
self.url = url
|
||||
|
||||
def info(self):
|
||||
return self.headers
|
||||
|
||||
def geturl(self):
|
||||
return self.url
|
||||
|
||||
|
||||
# Utilities to parse URLs (most of these return None for missing parts):
|
||||
# unwrap('<URL:type://host/path>') --> 'type://host/path'
|
||||
# splittype('type:opaquestring') --> 'type', 'opaquestring'
|
||||
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
|
||||
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
|
||||
# splitpasswd('user:passwd') -> 'user', 'passwd'
|
||||
# splitport('host:port') --> 'host', 'port'
|
||||
# splitquery('/path?query') --> '/path', 'query'
|
||||
# splittag('/path#tag') --> '/path', 'tag'
|
||||
# splitattr('/path;attr1=value1;attr2=value2;...') ->
|
||||
# '/path', ['attr1=value1', 'attr2=value2', ...]
|
||||
# splitvalue('attr=value') --> 'attr', 'value'
|
||||
# splitgophertype('/Xselector') --> 'X', 'selector'
|
||||
# unquote('abc%20def') -> 'abc def'
|
||||
# quote('abc def') -> 'abc%20def')
|
||||
|
||||
try:
|
||||
unicode
|
||||
except NameError:
|
||||
def _is_unicode(x):
|
||||
return 0
|
||||
else:
|
||||
def _is_unicode(x):
|
||||
return isinstance(x, unicode)
|
||||
|
||||
def toBytes(url):
|
||||
"""toBytes(u"URL") --> 'URL'."""
|
||||
# Most URL schemes require ASCII. If that changes, the conversion
|
||||
# can be relaxed
|
||||
if _is_unicode(url):
|
||||
try:
|
||||
url = url.encode("ASCII")
|
||||
except UnicodeError:
|
||||
raise UnicodeError("URL " + repr(url) +
|
||||
" contains non-ASCII characters")
|
||||
return url
|
||||
|
||||
def unwrap(url):
|
||||
"""unwrap('<URL:type://host/path>') --> 'type://host/path'."""
|
||||
url = url.strip()
|
||||
if url[:1] == '<' and url[-1:] == '>':
|
||||
url = url[1:-1].strip()
|
||||
if url[:4] == 'URL:': url = url[4:].strip()
|
||||
return url
|
||||
|
||||
_typeprog = None
|
||||
def splittype(url):
|
||||
"""splittype('type:opaquestring') --> 'type', 'opaquestring'."""
|
||||
global _typeprog
|
||||
if _typeprog is None:
|
||||
import re
|
||||
_typeprog = re.compile('^([^/:]+):')
|
||||
|
||||
match = _typeprog.match(url)
|
||||
if match:
|
||||
scheme = match.group(1)
|
||||
return scheme.lower(), url[len(scheme) + 1:]
|
||||
return None, url
|
||||
|
||||
_hostprog = None
|
||||
def splithost(url):
|
||||
"""splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
|
||||
global _hostprog
|
||||
if _hostprog is None:
|
||||
import re
|
||||
_hostprog = re.compile('^//([^/?]*)(.*)$')
|
||||
|
||||
match = _hostprog.match(url)
|
||||
if match: return match.group(1, 2)
|
||||
return None, url
|
||||
|
||||
_userprog = None
|
||||
def splituser(host):
|
||||
"""splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
|
||||
global _userprog
|
||||
if _userprog is None:
|
||||
import re
|
||||
_userprog = re.compile('^(.*)@(.*)$')
|
||||
|
||||
match = _userprog.match(host)
|
||||
if match: return map(unquote, match.group(1, 2))
|
||||
return None, host
|
||||
|
||||
_passwdprog = None
|
||||
def splitpasswd(user):
|
||||
"""splitpasswd('user:passwd') -> 'user', 'passwd'."""
|
||||
global _passwdprog
|
||||
if _passwdprog is None:
|
||||
import re
|
||||
_passwdprog = re.compile('^([^:]*):(.*)$')
|
||||
|
||||
match = _passwdprog.match(user)
|
||||
if match: return match.group(1, 2)
|
||||
return user, None
|
||||
|
||||
# splittag('/path#tag') --> '/path', 'tag'
|
||||
_portprog = None
|
||||
def splitport(host):
|
||||
"""splitport('host:port') --> 'host', 'port'."""
|
||||
global _portprog
|
||||
if _portprog is None:
|
||||
import re
|
||||
_portprog = re.compile('^(.*):([0-9]+)$')
|
||||
|
||||
match = _portprog.match(host)
|
||||
if match: return match.group(1, 2)
|
||||
return host, None
|
||||
|
||||
_nportprog = None
|
||||
def splitnport(host, defport=-1):
|
||||
"""Split host and port, returning numeric port.
|
||||
Return given default port if no ':' found; defaults to -1.
|
||||
Return numerical port if a valid number are found after ':'.
|
||||
Return None if ':' but not a valid number."""
|
||||
global _nportprog
|
||||
if _nportprog is None:
|
||||
import re
|
||||
_nportprog = re.compile('^(.*):(.*)$')
|
||||
|
||||
match = _nportprog.match(host)
|
||||
if match:
|
||||
host, port = match.group(1, 2)
|
||||
try:
|
||||
if not port: raise ValueError, "no digits"
|
||||
nport = int(port)
|
||||
except ValueError:
|
||||
nport = None
|
||||
return host, nport
|
||||
return host, defport
|
||||
|
||||
_queryprog = None
|
||||
def splitquery(url):
|
||||
"""splitquery('/path?query') --> '/path', 'query'."""
|
||||
global _queryprog
|
||||
if _queryprog is None:
|
||||
import re
|
||||
_queryprog = re.compile('^(.*)\?([^?]*)$')
|
||||
|
||||
match = _queryprog.match(url)
|
||||
if match: return match.group(1, 2)
|
||||
return url, None
|
||||
|
||||
_tagprog = None
|
||||
def splittag(url):
|
||||
"""splittag('/path#tag') --> '/path', 'tag'."""
|
||||
global _tagprog
|
||||
if _tagprog is None:
|
||||
import re
|
||||
_tagprog = re.compile('^(.*)#([^#]*)$')
|
||||
|
||||
match = _tagprog.match(url)
|
||||
if match: return match.group(1, 2)
|
||||
return url, None
|
||||
|
||||
def splitattr(url):
|
||||
"""splitattr('/path;attr1=value1;attr2=value2;...') ->
|
||||
'/path', ['attr1=value1', 'attr2=value2', ...]."""
|
||||
words = url.split(';')
|
||||
return words[0], words[1:]
|
||||
|
||||
_valueprog = None
|
||||
def splitvalue(attr):
|
||||
"""splitvalue('attr=value') --> 'attr', 'value'."""
|
||||
global _valueprog
|
||||
if _valueprog is None:
|
||||
import re
|
||||
_valueprog = re.compile('^([^=]*)=(.*)$')
|
||||
|
||||
match = _valueprog.match(attr)
|
||||
if match: return match.group(1, 2)
|
||||
return attr, None
|
||||
|
||||
def splitgophertype(selector):
|
||||
"""splitgophertype('/Xselector') --> 'X', 'selector'."""
|
||||
if selector[:1] == '/' and selector[1:2]:
|
||||
return selector[1], selector[2:]
|
||||
return None, selector
|
||||
|
||||
_hextochr = dict(('%02x' % i, chr(i)) for i in range(256))
|
||||
_hextochr.update(('%02X' % i, chr(i)) for i in range(256))
|
||||
|
||||
def unquote(s):
|
||||
"""unquote('abc%20def') -> 'abc def'."""
|
||||
res = s.split('%')
|
||||
for i in xrange(1, len(res)):
|
||||
item = res[i]
|
||||
try:
|
||||
res[i] = _hextochr[item[:2]] + item[2:]
|
||||
except KeyError:
|
||||
res[i] = '%' + item
|
||||
except UnicodeDecodeError:
|
||||
res[i] = unichr(int(item[:2], 16)) + item[2:]
|
||||
return "".join(res)
|
||||
|
||||
def unquote_plus(s):
|
||||
"""unquote('%7e/abc+def') -> '~/abc def'"""
|
||||
s = s.replace('+', ' ')
|
||||
return unquote(s)
|
||||
|
||||
always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
|
||||
'abcdefghijklmnopqrstuvwxyz'
|
||||
'0123456789' '_.-')
|
||||
_safemaps = {}
|
||||
|
||||
def quote(s, safe = '/'):
|
||||
"""quote('abc def') -> 'abc%20def'
|
||||
|
||||
Each part of a URL, e.g. the path info, the query, etc., has a
|
||||
different set of reserved characters that must be quoted.
|
||||
|
||||
RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
|
||||
the following reserved characters.
|
||||
|
||||
reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
|
||||
"$" | ","
|
||||
|
||||
Each of these characters is reserved in some component of a URL,
|
||||
but not necessarily in all of them.
|
||||
|
||||
By default, the quote function is intended for quoting the path
|
||||
section of a URL. Thus, it will not encode '/'. This character
|
||||
is reserved, but in typical usage the quote function is being
|
||||
called on a path where the existing slash characters are used as
|
||||
reserved characters.
|
||||
"""
|
||||
cachekey = (safe, always_safe)
|
||||
try:
|
||||
safe_map = _safemaps[cachekey]
|
||||
except KeyError:
|
||||
safe += always_safe
|
||||
safe_map = {}
|
||||
for i in range(256):
|
||||
c = chr(i)
|
||||
safe_map[c] = (c in safe) and c or ('%%%02X' % i)
|
||||
_safemaps[cachekey] = safe_map
|
||||
res = map(safe_map.__getitem__, s)
|
||||
return ''.join(res)
|
||||
|
||||
def quote_plus(s, safe = ''):
|
||||
"""Quote the query fragment of a URL; replacing ' ' with '+'"""
|
||||
if ' ' in s:
|
||||
s = quote(s, safe + ' ')
|
||||
return s.replace(' ', '+')
|
||||
return quote(s, safe)
|
||||
|
||||
def urlencode(query,doseq=0):
|
||||
"""Encode a sequence of two-element tuples or dictionary into a URL query string.
|
||||
|
||||
If any values in the query arg are sequences and doseq is true, each
|
||||
sequence element is converted to a separate parameter.
|
||||
|
||||
If the query arg is a sequence of two-element tuples, the order of the
|
||||
parameters in the output will match the order of parameters in the
|
||||
input.
|
||||
"""
|
||||
|
||||
if hasattr(query,"items"):
|
||||
# mapping objects
|
||||
query = query.items()
|
||||
else:
|
||||
# it's a bother at times that strings and string-like objects are
|
||||
# sequences...
|
||||
try:
|
||||
# non-sequence items should not work with len()
|
||||
# non-empty strings will fail this
|
||||
if len(query) and not isinstance(query[0], tuple):
|
||||
raise TypeError
|
||||
# zero-length sequences of all types will get here and succeed,
|
||||
# but that's a minor nit - since the original implementation
|
||||
# allowed empty dicts that type of behavior probably should be
|
||||
# preserved for consistency
|
||||
except TypeError:
|
||||
ty,va,tb = sys.exc_info()
|
||||
raise TypeError, "not a valid non-string sequence or mapping object", tb
|
||||
|
||||
l = []
|
||||
if not doseq:
|
||||
# preserve old behavior
|
||||
for k, v in query:
|
||||
k = quote_plus(str(k))
|
||||
v = quote_plus(str(v))
|
||||
l.append(k + '=' + v)
|
||||
else:
|
||||
for k, v in query:
|
||||
k = quote_plus(str(k))
|
||||
if isinstance(v, str):
|
||||
v = quote_plus(v)
|
||||
l.append(k + '=' + v)
|
||||
elif _is_unicode(v):
|
||||
# is there a reasonable way to convert to ASCII?
|
||||
# encode generates a string, but "replace" or "ignore"
|
||||
# lose information and "strict" can raise UnicodeError
|
||||
v = quote_plus(v.encode("ASCII","replace"))
|
||||
l.append(k + '=' + v)
|
||||
else:
|
||||
try:
|
||||
# is this a sufficient test for sequence-ness?
|
||||
x = len(v)
|
||||
except TypeError:
|
||||
# not a sequence
|
||||
v = quote_plus(str(v))
|
||||
l.append(k + '=' + v)
|
||||
else:
|
||||
# loop over the sequence
|
||||
for elt in v:
|
||||
l.append(k + '=' + quote_plus(str(elt)))
|
||||
return '&'.join(l)
|
||||
|
||||
# Proxy handling
|
||||
def getproxies_environment():
|
||||
"""Return a dictionary of scheme -> proxy server URL mappings.
|
||||
|
||||
Scan the environment for variables named <scheme>_proxy;
|
||||
this seems to be the standard convention. If you need a
|
||||
different way, you can pass a proxies dictionary to the
|
||||
[Fancy]URLopener constructor.
|
||||
|
||||
"""
|
||||
proxies = {}
|
||||
for name, value in os.environ.items():
|
||||
name = name.lower()
|
||||
if value and name[-6:] == '_proxy':
|
||||
proxies[name[:-6]] = value
|
||||
return proxies
|
||||
|
||||
if sys.platform == 'darwin':
|
||||
def getproxies_internetconfig():
|
||||
"""Return a dictionary of scheme -> proxy server URL mappings.
|
||||
|
||||
By convention the mac uses Internet Config to store
|
||||
proxies. An HTTP proxy, for instance, is stored under
|
||||
the HttpProxy key.
|
||||
|
||||
"""
|
||||
try:
|
||||
import ic
|
||||
except ImportError:
|
||||
return {}
|
||||
|
||||
try:
|
||||
config = ic.IC()
|
||||
except ic.error:
|
||||
return {}
|
||||
proxies = {}
|
||||
# HTTP:
|
||||
if 'UseHTTPProxy' in config and config['UseHTTPProxy']:
|
||||
try:
|
||||
value = config['HTTPProxyHost']
|
||||
except ic.error:
|
||||
pass
|
||||
else:
|
||||
proxies['http'] = 'http://%s' % value
|
||||
# FTP: XXXX To be done.
|
||||
# Gopher: XXXX To be done.
|
||||
return proxies
|
||||
|
||||
def proxy_bypass(x):
|
||||
return 0
|
||||
|
||||
def getproxies():
|
||||
return getproxies_environment() or getproxies_internetconfig()
|
||||
|
||||
elif os.name == 'nt':
|
||||
def getproxies_registry():
|
||||
"""Return a dictionary of scheme -> proxy server URL mappings.
|
||||
|
||||
Win32 uses the registry to store proxies.
|
||||
|
||||
"""
|
||||
proxies = {}
|
||||
try:
|
||||
import _winreg
|
||||
except ImportError:
|
||||
# Std module, so should be around - but you never know!
|
||||
return proxies
|
||||
try:
|
||||
internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
|
||||
r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
|
||||
proxyEnable = _winreg.QueryValueEx(internetSettings,
|
||||
'ProxyEnable')[0]
|
||||
if proxyEnable:
|
||||
# Returned as Unicode but problems if not converted to ASCII
|
||||
proxyServer = str(_winreg.QueryValueEx(internetSettings,
|
||||
'ProxyServer')[0])
|
||||
if '=' in proxyServer:
|
||||
# Per-protocol settings
|
||||
for p in proxyServer.split(';'):
|
||||
protocol, address = p.split('=', 1)
|
||||
# See if address has a type:// prefix
|
||||
import re
|
||||
if not re.match('^([^/:]+)://', address):
|
||||
address = '%s://%s' % (protocol, address)
|
||||
proxies[protocol] = address
|
||||
else:
|
||||
# Use one setting for all protocols
|
||||
if proxyServer[:5] == 'http:':
|
||||
proxies['http'] = proxyServer
|
||||
else:
|
||||
proxies['http'] = 'http://%s' % proxyServer
|
||||
proxies['ftp'] = 'ftp://%s' % proxyServer
|
||||
internetSettings.Close()
|
||||
except (WindowsError, ValueError, TypeError):
|
||||
# Either registry key not found etc, or the value in an
|
||||
# unexpected format.
|
||||
# proxies already set up to be empty so nothing to do
|
||||
pass
|
||||
return proxies
|
||||
|
||||
def getproxies():
|
||||
"""Return a dictionary of scheme -> proxy server URL mappings.
|
||||
|
||||
Returns settings gathered from the environment, if specified,
|
||||
or the registry.
|
||||
|
||||
"""
|
||||
return getproxies_environment() or getproxies_registry()
|
||||
|
||||
def proxy_bypass(host):
|
||||
try:
|
||||
import _winreg
|
||||
import re
|
||||
except ImportError:
|
||||
# Std modules, so should be around - but you never know!
|
||||
return 0
|
||||
try:
|
||||
internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
|
||||
r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
|
||||
proxyEnable = _winreg.QueryValueEx(internetSettings,
|
||||
'ProxyEnable')[0]
|
||||
proxyOverride = str(_winreg.QueryValueEx(internetSettings,
|
||||
'ProxyOverride')[0])
|
||||
# ^^^^ Returned as Unicode but problems if not converted to ASCII
|
||||
except WindowsError:
|
||||
return 0
|
||||
if not proxyEnable or not proxyOverride:
|
||||
return 0
|
||||
# try to make a host list from name and IP address.
|
||||
rawHost, port = splitport(host)
|
||||
host = [rawHost]
|
||||
try:
|
||||
addr = socket.gethostbyname(rawHost)
|
||||
if addr != rawHost:
|
||||
host.append(addr)
|
||||
except socket.error:
|
||||
pass
|
||||
try:
|
||||
fqdn = socket.getfqdn(rawHost)
|
||||
if fqdn != rawHost:
|
||||
host.append(fqdn)
|
||||
except socket.error:
|
||||
pass
|
||||
# make a check value list from the registry entry: replace the
|
||||
# '<local>' string by the localhost entry and the corresponding
|
||||
# canonical entry.
|
||||
proxyOverride = proxyOverride.split(';')
|
||||
i = 0
|
||||
while i < len(proxyOverride):
|
||||
if proxyOverride[i] == '<local>':
|
||||
proxyOverride[i:i+1] = ['localhost',
|
||||
'127.0.0.1',
|
||||
socket.gethostname(),
|
||||
socket.gethostbyname(
|
||||
socket.gethostname())]
|
||||
i += 1
|
||||
# print proxyOverride
|
||||
# now check if we match one of the registry values.
|
||||
for test in proxyOverride:
|
||||
test = test.replace(".", r"\.") # mask dots
|
||||
test = test.replace("*", r".*") # change glob sequence
|
||||
test = test.replace("?", r".") # change glob char
|
||||
for val in host:
|
||||
# print "%s <--> %s" %( test, val )
|
||||
if re.match(test, val, re.I):
|
||||
return 1
|
||||
return 0
|
||||
|
||||
else:
|
||||
# By default use environment variables
|
||||
getproxies = getproxies_environment
|
||||
|
||||
def proxy_bypass(host):
|
||||
return 0
|
||||
|
||||
# Test and time quote() and unquote()
|
||||
def test1():
|
||||
|
Reference in New Issue
Block a user