Use requests lib instead of low-level urllib calls

With the requests library, users can run stackalytics-processor from
environments located behind an HTTP/HTTPS proxy. The proxy address is
configured by setting the HTTP_PROXY or HTTPS_PROXY environment
variable.

Closes-Bug: #1351136

Change-Id: I6a65afb0f99b351dc2183294d9127cbbebc35856
Ilya Shakhat, 2015-09-22 15:37:07 +03:00
commit 5a8bc5649e (parent 43f38cd4f8)
4 changed files with 45 additions and 38 deletions
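
As an illustrative note (not part of the commit): requests reads the standard
proxy environment variables on its own, so no proxy-specific code is needed in
the processor. A minimal sketch of what this enables, assuming a hypothetical
proxy at proxy.example.com:3128:

    import os
    import requests

    # Placeholder proxy address; requests honours HTTP_PROXY / HTTPS_PROXY
    # automatically because Session.trust_env defaults to True.
    os.environ['HTTPS_PROXY'] = 'http://proxy.example.com:3128'

    # Any call made through requests (and therefore through utils.do_request)
    # is now routed via the configured proxy.
    response = requests.get('https://launchpad.net/')
    print(response.status_code)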


@@ -16,5 +16,7 @@ psutil<2.0.0,>=1.1.1
 PyGithub
 python-memcached>=1.56
 PyYAML>=3.1.0
+requests>=2.5.2
+requests-file
 sh
 six>=1.9.0


@@ -15,8 +15,6 @@
 from oslo_log import log as logging
 import six
-from six.moves import http_client
-from six.moves.urllib import parse
 from stackalytics.processor import utils
@@ -54,14 +52,11 @@ def lp_profile_by_email(email):
 def lp_module_exists(module):
     uri = LP_URI_DEVEL % module
-    parsed_uri = parse.urlparse(uri)
-    conn = http_client.HTTPConnection(parsed_uri.netloc)
-    conn.request('GET', parsed_uri.path)
-    res = conn.getresponse()
+    request = utils.do_request(uri)
     LOG.debug('Checked uri: %(uri)s, status: %(status)s',
-              {'uri': uri, 'status': res.status})
-    conn.close()
-    return res.status != 404
+              {'uri': uri, 'status': request.status_code})
+    return request.status_code != 404
 def lp_blueprint_generator(module):


@@ -18,7 +18,6 @@ import re
 from oslo_log import log as logging
 import six
-from six.moves import http_client
 from six.moves.urllib import parse
 from stackalytics.processor import utils
@@ -60,31 +59,25 @@ def _get_mail_archive_links(uri):
     return [parse.urljoin(uri, link) for link in links]
-def _link_content_changed(link, runtime_storage_inst):
-    LOG.debug('Check changes for mail archive located at uri: %s', link)
-    parsed_uri = parse.urlparse(link)
-    conn = http_client.HTTPConnection(parsed_uri.netloc)
-    conn.request('HEAD', parsed_uri.path)
-    res = conn.getresponse()
-    last_modified = res.getheader('last-modified')
-    conn.close()
+def _uri_content_changed(uri, runtime_storage_inst):
+    LOG.debug('Check changes for mail archive located at: %s', uri)
+    last_modified = utils.get_uri_last_modified(uri)
-    if last_modified != runtime_storage_inst.get_by_key('mail_link:' + link):
+    if last_modified != runtime_storage_inst.get_by_key('mail_link:' + uri):
         LOG.debug('Mail archive changed, last modified at: %s', last_modified)
-        runtime_storage_inst.set_by_key('mail_link:' + link, last_modified)
+        runtime_storage_inst.set_by_key('mail_link:' + uri, last_modified)
         return True
     return False
 def _retrieve_mails(uri):
-    LOG.debug('Retrieving mail archive from uri: %s', uri)
-    content = utils.read_uri(uri)
+    LOG.debug('Retrieving mail archive from: %s', uri)
+    content = utils.read_gzip_from_uri(uri)
     if not content:
-        LOG.error('Error reading mail archive from uri: %s', uri)
+        LOG.error('Error reading mail archive from: %s', uri)
         return
-    content = utils.gzip_decompress(content)
     LOG.debug('Mail archive is loaded, start processing')
     content += TRAILING_RECORD
@@ -116,7 +109,7 @@ def log(uri, runtime_storage_inst):
     links = _get_mail_archive_links(uri)
     for link in links:
-        if _link_content_changed(link, runtime_storage_inst):
+        if _uri_content_changed(link, runtime_storage_inst):
             for mail in _retrieve_mails(link):
                 LOG.debug('New mail: %s', mail['message_id'])
                 yield mail


@@ -17,8 +17,6 @@ import calendar
 import cgi
 import datetime
 import gzip
-import io
-import json
 import random
 import re
 import time
@@ -26,6 +24,8 @@ import time
 import iso8601
 from oslo_config import cfg
 from oslo_log import log as logging
+import requests
+import requests_file
 import six
@@ -117,30 +117,31 @@ user_agents = [
 ]
+def do_request(uri, method='get'):
+    with requests.Session() as session:
+        session.mount('file://', requests_file.FileAdapter())
+        user_agent = random.choice(user_agents)
+        return session.request(method, uri, headers={'User-Agent': user_agent})
 def read_uri(uri):
     try:
-        req = six.moves.urllib.request.Request(
-            url=uri, headers={'User-Agent': random.choice(user_agents)})
-        fd = six.moves.urllib.request.urlopen(req)
-        if six.PY3:
-            fd = io.TextIOWrapper(fd)
-        raw = fd.read()
-        fd.close()
-        return raw
+        return do_request(uri).text
     except Exception as e:
-        LOG.warn('Error "%(error)s" while reading uri %(uri)s',
+        LOG.warn('Error "%(error)s" retrieving uri %(uri)s',
                  {'error': e, 'uri': uri})
 def read_json_from_uri(uri):
     try:
-        return json.loads(read_uri(uri))
+        return do_request(uri).json()
     except Exception as e:
         LOG.warn('Error "%(error)s" parsing json from uri %(uri)s',
                  {'error': e, 'uri': uri})
-def gzip_decompress(content):
+def _gzip_decompress(content):
     if six.PY3:
         return gzip.decompress(content).decode('utf8')
     else:
@@ -148,6 +149,22 @@ def gzip_decompress(content):
         return gzip_fd.read()
+def read_gzip_from_uri(uri):
+    try:
+        return _gzip_decompress(do_request(uri).content)
+    except Exception as e:
+        LOG.warn('Error "%(error)s" retrieving uri %(uri)s',
+                 {'error': e, 'uri': uri})
+def get_uri_last_modified(uri):
+    try:
+        return do_request(uri, method='head').headers['last-modified']
+    except Exception as e:
+        LOG.warn('Error "%(error)s" retrieving uri %(uri)s',
+                 {'error': e, 'uri': uri})
 def cmp_to_key(mycmp):  # ported from python 3
     """Convert a cmp= function into a key= function."""
     class K(object):
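
For orientation (not part of the commit), a minimal usage sketch of the new
helpers added above; the URI is a placeholder, and on error each helper logs a
warning and returns None, as shown in the diff:

    from stackalytics.processor import utils

    # Placeholder URI; any HTTP(S) or file:// location works, since
    # do_request mounts requests_file.FileAdapter for the file:// scheme.
    uri = 'https://example.org/archive.txt.gz'

    response = utils.do_request(uri)                  # GET via a requests session
    print(response.status_code)

    last_modified = utils.get_uri_last_modified(uri)  # HEAD request, Last-Modified header
    archive_text = utils.read_gzip_from_uri(uri)      # download and gunzip in one step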