From 5a8bc5649ee47232998d2b5cdf2da20df5eaaf80 Mon Sep 17 00:00:00 2001 From: Ilya Shakhat Date: Tue, 22 Sep 2015 15:37:07 +0300 Subject: [PATCH] Use requests lib instead of low-level urllib calls With the requests library, users can run stackalytics-processor from environments located behind an HTTP/HTTPS proxy. The proxy address is configured by setting the environment variables HTTP_PROXY or HTTPS_PROXY. Closes-Bug: #1351136 Change-Id: I6a65afb0f99b351dc2183294d9127cbbebc35856 --- requirements.txt | 2 ++ stackalytics/processor/launchpad_utils.py | 13 +++---- stackalytics/processor/mls.py | 25 +++++-------- stackalytics/processor/utils.py | 43 ++++++++++++++++------- 4 files changed, 45 insertions(+), 38 deletions(-) diff --git a/requirements.txt b/requirements.txt index e84333990..0cc7bf495 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,5 +16,7 @@ psutil<2.0.0,>=1.1.1 PyGithub python-memcached>=1.56 PyYAML>=3.1.0 +requests>=2.5.2 +requests-file sh six>=1.9.0 diff --git a/stackalytics/processor/launchpad_utils.py b/stackalytics/processor/launchpad_utils.py index b17608836..a07f6a08c 100644 --- a/stackalytics/processor/launchpad_utils.py +++ b/stackalytics/processor/launchpad_utils.py @@ -15,8 +15,6 @@ from oslo_log import log as logging import six -from six.moves import http_client -from six.moves.urllib import parse from stackalytics.processor import utils @@ -54,14 +52,11 @@ def lp_profile_by_email(email): def lp_module_exists(module): uri = LP_URI_DEVEL % module - parsed_uri = parse.urlparse(uri) - conn = http_client.HTTPConnection(parsed_uri.netloc) - conn.request('GET', parsed_uri.path) - res = conn.getresponse() + request = utils.do_request(uri) + LOG.debug('Checked uri: %(uri)s, status: %(status)s', - {'uri': uri, 'status': res.status}) - conn.close() - return res.status != 404 + {'uri': uri, 'status': request.status_code}) + return request.status_code != 404 def lp_blueprint_generator(module): diff --git a/stackalytics/processor/mls.py 
b/stackalytics/processor/mls.py index db4f367f8..191ad6fd6 100644 --- a/stackalytics/processor/mls.py +++ b/stackalytics/processor/mls.py @@ -18,7 +18,6 @@ import re from oslo_log import log as logging import six -from six.moves import http_client from six.moves.urllib import parse from stackalytics.processor import utils @@ -60,31 +59,25 @@ def _get_mail_archive_links(uri): return [parse.urljoin(uri, link) for link in links] -def _link_content_changed(link, runtime_storage_inst): - LOG.debug('Check changes for mail archive located at uri: %s', link) - parsed_uri = parse.urlparse(link) - conn = http_client.HTTPConnection(parsed_uri.netloc) - conn.request('HEAD', parsed_uri.path) - res = conn.getresponse() - last_modified = res.getheader('last-modified') - conn.close() +def _uri_content_changed(uri, runtime_storage_inst): + LOG.debug('Check changes for mail archive located at: %s', uri) + last_modified = utils.get_uri_last_modified(uri) - if last_modified != runtime_storage_inst.get_by_key('mail_link:' + link): + if last_modified != runtime_storage_inst.get_by_key('mail_link:' + uri): LOG.debug('Mail archive changed, last modified at: %s', last_modified) - runtime_storage_inst.set_by_key('mail_link:' + link, last_modified) + runtime_storage_inst.set_by_key('mail_link:' + uri, last_modified) return True return False def _retrieve_mails(uri): - LOG.debug('Retrieving mail archive from uri: %s', uri) - content = utils.read_uri(uri) + LOG.debug('Retrieving mail archive from: %s', uri) + content = utils.read_gzip_from_uri(uri) if not content: - LOG.error('Error reading mail archive from uri: %s', uri) + LOG.error('Error reading mail archive from: %s', uri) return - content = utils.gzip_decompress(content) LOG.debug('Mail archive is loaded, start processing') content += TRAILING_RECORD @@ -116,7 +109,7 @@ def log(uri, runtime_storage_inst): links = _get_mail_archive_links(uri) for link in links: - if _link_content_changed(link, runtime_storage_inst): + if 
_uri_content_changed(link, runtime_storage_inst): for mail in _retrieve_mails(link): LOG.debug('New mail: %s', mail['message_id']) yield mail diff --git a/stackalytics/processor/utils.py b/stackalytics/processor/utils.py index 1a2315d71..ad5011797 100644 --- a/stackalytics/processor/utils.py +++ b/stackalytics/processor/utils.py @@ -17,8 +17,6 @@ import calendar import cgi import datetime import gzip -import io -import json import random import re import time @@ -26,6 +24,8 @@ import time import iso8601 from oslo_config import cfg from oslo_log import log as logging +import requests +import requests_file import six @@ -117,30 +117,31 @@ user_agents = [ ] +def do_request(uri, method='get'): + with requests.Session() as session: + session.mount('file://', requests_file.FileAdapter()) + user_agent = random.choice(user_agents) + + return session.request(method, uri, headers={'User-Agent': user_agent}) + + def read_uri(uri): try: - req = six.moves.urllib.request.Request( - url=uri, headers={'User-Agent': random.choice(user_agents)}) - fd = six.moves.urllib.request.urlopen(req) - if six.PY3: - fd = io.TextIOWrapper(fd) - raw = fd.read() - fd.close() - return raw + return do_request(uri).text except Exception as e: - LOG.warn('Error "%(error)s" while reading uri %(uri)s', + LOG.warn('Error "%(error)s" retrieving uri %(uri)s', {'error': e, 'uri': uri}) def read_json_from_uri(uri): try: - return json.loads(read_uri(uri)) + return do_request(uri).json() except Exception as e: LOG.warn('Error "%(error)s" parsing json from uri %(uri)s', {'error': e, 'uri': uri}) -def gzip_decompress(content): +def _gzip_decompress(content): if six.PY3: return gzip.decompress(content).decode('utf8') else: @@ -148,6 +149,22 @@ def gzip_decompress(content): return gzip_fd.read() +def read_gzip_from_uri(uri): + try: + return _gzip_decompress(do_request(uri).content) + except Exception as e: + LOG.warn('Error "%(error)s" retrieving uri %(uri)s', + {'error': e, 'uri': uri}) + + +def 
get_uri_last_modified(uri): + try: + return do_request(uri, method='head').headers['last-modified'] + except Exception as e: + LOG.warn('Error "%(error)s" retrieving uri %(uri)s', + {'error': e, 'uri': uri}) + + def cmp_to_key(mycmp): # ported from python 3 """Convert a cmp= function into a key= function.""" class K(object):