stackalytics/stackalytics/processor/utils.py

381 lines
10 KiB
Python

# Copyright (c) 2013 Mirantis Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import calendar
import datetime
import functools
import gzip
import html
import random
import re
import time

import iso8601
from oslo_config import cfg
from oslo_log import log as logging
import requests
import requests_file
import six
import yaml
# Global oslo.config object; options are registered on it by
# init_config_and_logging() and read by the request helpers below.
CONF = cfg.CONF
# Logger named after this module, used for all warnings emitted here.
LOG = logging.getLogger(__name__)
def init_config_and_logging(opts):
    """Register options, parse configuration and set up logging.

    :param opts: list of options; registered on the global CONF both as
                 command-line options and as regular options
    """
    CONF.register_cli_opts(opts)
    CONF.register_opts(opts)
    # logging options are registered before CONF is parsed below
    logging.register_options(CONF)
    logging.set_defaults()

    # parse the configuration for the 'stackalytics' project
    CONF(project='stackalytics')

    logging.setup(CONF, 'stackalytics')
    LOG.info('Logging enabled')
    # dump the effective option values at DEBUG level
    CONF.log_opt_values(LOG, logging.DEBUG)
def date_to_timestamp(d):
    """Convert a '%Y-%b-%d' date string into an integer timestamp.

    :param d: date string such as '2013-Jan-01', the literal 'now', or a
              falsy value
    :returns: 0 for a falsy value, the current time for 'now', otherwise the
              timestamp produced by time.mktime() for the parsed date
    :raises ValueError: if the string does not match the expected format
    """
    if not d:
        return 0

    if d == 'now':
        current_time = time.time()
        return int(current_time)

    parsed = datetime.datetime.strptime(d, '%Y-%b-%d')
    return int(time.mktime(parsed.timetuple()))
def date_to_timestamp_ext(d):
    """Convert a date string or a numeric value into an integer timestamp.

    The value is first handed to date_to_timestamp(); if that raises
    ValueError or TypeError the value is converted with int() instead.

    :param d: date string accepted by date_to_timestamp() or an int-like value
    :returns: integer timestamp
    """
    try:
        timestamp = date_to_timestamp(d)
    except (ValueError, TypeError):
        timestamp = int(d)
    return timestamp
def member_date_to_timestamp(d):
    """Convert a '%B %d, %Y ' date string into an integer timestamp.

    Note that the expected format ends with a trailing space.

    :param d: date string such as 'August 01, 2012 ' or a falsy value
    :returns: 0 for a falsy value, otherwise the timestamp produced by
              time.mktime() for the parsed date
    :raises ValueError: if the string does not match the expected format
    """
    if d:
        parsed = datetime.datetime.strptime(d, '%B %d, %Y ')
        return int(time.mktime(parsed.timetuple()))
    return 0
def iso8601_to_timestamp(s):
    """Convert an ISO 8601 string into an integer timestamp.

    The string is parsed with iso8601.parse_date() and the UTC time tuple of
    the result is converted with calendar.timegm().

    :param s: ISO 8601 formatted date/time string
    :returns: integer timestamp
    """
    parsed = iso8601.parse_date(s)
    utc_tuple = parsed.utctimetuple()
    return calendar.timegm(utc_tuple)
def timestamp_to_date(timestamp):
    """Format a timestamp as a '%Y-%b-%d' date string.

    The timestamp is converted with datetime.fromtimestamp() without an
    explicit timezone.

    :param timestamp: timestamp in seconds
    :returns: date string such as '2013-Jan-01'
    """
    moment = datetime.datetime.fromtimestamp(timestamp)
    return moment.strftime('%Y-%b-%d')
def timestamp_to_week(timestamp):
    """Return the number of the week a timestamp falls into.

    Weeks are counted from Jan 4th 1970, so that week 0 starts three days
    after the Epoch.

    :param timestamp: timestamp in seconds
    :returns: week number (floor division result)
    """
    seconds_per_day = 24 * 3600
    # Jan 4th 1970 is the first Sunday in the Epoch
    since_first_sunday = timestamp - 3 * seconds_per_day
    return since_first_sunday // (7 * seconds_per_day)
def week_to_date(week):
    """Format the start of a week as a '%Y-%m-%d %H:%M:%S' string.

    The week number is turned back into a timestamp using the same three-day
    offset as timestamp_to_week() and formatted with
    datetime.fromtimestamp() without an explicit timezone.

    :param week: week number
    :returns: formatted date/time string
    """
    week_start = week * 7 * 24 * 3600 + 3 * 24 * 3600
    moment = datetime.datetime.fromtimestamp(week_start)
    return moment.strftime('%Y-%m-%d %H:%M:%S')
def timestamp_to_day(timestamp):
    """Return the number of whole days between the Epoch and a timestamp.

    :param timestamp: timestamp in seconds
    :returns: day number (floor division by the length of a day)
    """
    seconds_per_day = 24 * 3600
    return timestamp // seconds_per_day
def timestamp_to_utc_date(timestamp):
    """Format a timestamp as a '%Y-%m-%d' date string in UTC.

    The conversion is pinned to UTC explicitly; calling fromtimestamp()
    without a timezone would use the local timezone of the host and return
    the wrong date close to midnight on non-UTC machines.

    :param timestamp: timestamp in seconds
    :returns: UTC date string such as '1970-01-02'
    """
    moment = datetime.datetime.fromtimestamp(timestamp,
                                             tz=datetime.timezone.utc)
    return moment.strftime('%Y-%m-%d')
def round_timestamp_to_day(timestamp):
    """Round a timestamp down to a multiple of one day.

    :param timestamp: timestamp in seconds; converted with int() first
    :returns: integer timestamp that is a multiple of 24 * 3600
    """
    seconds_per_day = 24 * 3600
    whole_days = int(timestamp) // seconds_per_day
    return whole_days * seconds_per_day
def check_email_validity(email):
    """Check whether a string starts with something shaped like an email.

    The local part may contain word characters, dots, plus and hyphen; the
    domain consists of one or more dot-separated labels made of word
    characters, dots and hyphens, followed by a final label of word
    characters. As before, the pattern is only anchored at the start.

    :param email: string to check; may be empty or None
    :returns: a truthy match object when the string matches, None when it
              does not, False when ``email`` is falsy
    """
    if email:
        # '-' is placed last in each class so it is a literal hyphen; the
        # previous pattern spelled it as '\.-\\', which formed a character
        # range from '.' to '\' that excluded '-' and admitted '@', '<', etc.
        return re.match(r'[\w.+-]+@([\w.-]+\.)+\w+', email)
    return False
# Pool of browser-like User-Agent header values; _session_request() picks one
# of them at random for every outgoing request.
user_agents = [
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64) Gecko/20100101 Firefox/41.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9',
    'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X) Chrome/45.0.2062.120',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'
]
def _session_request(session, uri, method):
    """Issue a request through the given session.

    A file:// adapter is mounted on the session, a User-Agent is picked at
    random from ``user_agents`` and the request is sent with
    CONF.read_timeout as its timeout.

    :param session: requests session used to send the request
    :param uri: target URI
    :param method: HTTP method name
    :returns: the response object returned by session.request()
    """
    # make file:// URIs retrievable through the same session
    session.mount('file://', requests_file.FileAdapter())

    request_headers = {
        'User-Agent': random.choice(user_agents),
        'Accept': 'application/json',
    }
    return session.request(
        method, uri, headers=request_headers, timeout=CONF.read_timeout)
def do_request(uri, method='get', session=None):
    """Send a request, creating a temporary session when none is given.

    :param uri: target URI
    :param method: HTTP method name, 'get' by default
    :param session: optional existing session to reuse
    :returns: the response object returned by _session_request()
    """
    if not session:
        # no session supplied: use a short-lived one closed on exit
        with requests.Session() as temporary_session:
            return _session_request(temporary_session, uri, method)

    return _session_request(session, uri, method)
def read_uri(uri, session=None):
    """Fetch a URI and return the response body as text.

    Any exception is logged as a warning instead of being propagated.

    :param uri: URI to retrieve
    :param session: optional session passed through to do_request()
    :returns: response text, or None if an exception occurred
    """
    try:
        response = do_request(uri, session=session)
        return response.text
    except Exception as e:
        LOG.warning('Error "%(error)s" retrieving uri %(uri)s',
                    {'error': e, 'uri': uri})
        return None
def read_json_from_uri(uri, session=None):
    """Fetch a URI and return the response body decoded as JSON.

    Any exception is logged as a warning instead of being propagated.

    :param uri: URI to retrieve
    :param session: optional session passed through to do_request()
    :returns: result of response.json(), or None if an exception occurred
    """
    try:
        response = do_request(uri, session=session)
        return response.json()
    except Exception as e:
        LOG.warning('Error "%(error)s" parsing json from uri %(uri)s',
                    {'error': e, 'uri': uri})
        return None
def read_yaml_from_uri(uri):
    """Fetch a URI with read_uri() and parse the text with yaml.safe_load().

    Any exception is logged as a warning instead of being propagated.

    :param uri: URI to retrieve
    :returns: parsed YAML document, or None if an exception occurred
    """
    try:
        raw_text = read_uri(uri)
        return yaml.safe_load(raw_text)
    except Exception as e:
        LOG.warning('Error "%(error)s" parsing yaml from uri %(uri)s',
                    {'error': e, 'uri': uri})
        return None
def _gzip_decompress(content):
    """Decompress gzip-compressed content.

    :param content: gzip-compressed data
    :returns: on Python 3 the decompressed data decoded as UTF-8; otherwise
              whatever GzipFile.read() returns for the data
    """
    if not six.PY3:
        gzip_fd = gzip.GzipFile(fileobj=six.moves.StringIO(content))
        return gzip_fd.read()

    decompressed = gzip.decompress(content)
    return decompressed.decode('utf8')
def read_txt_from_uri(uri):
    """Fetch a URI and return the raw response content decoded as UTF-8.

    Any exception is logged as a warning instead of being propagated.

    :param uri: URI to retrieve
    :returns: decoded text, or None if an exception occurred
    """
    try:
        raw_content = do_request(uri).content
        return raw_content.decode('utf8')
    except Exception as e:
        LOG.warning('Error "%(error)s" retrieving uri %(uri)s',
                    {'error': e, 'uri': uri})
        return None
def read_gzip_from_uri(uri):
    """Fetch a URI and return its gzip-compressed content decompressed.

    Any exception is logged as a warning instead of being propagated.

    :param uri: URI to retrieve
    :returns: result of _gzip_decompress(), or None if an exception occurred
    """
    try:
        compressed = do_request(uri).content
        return _gzip_decompress(compressed)
    except Exception as e:
        LOG.warning('Error "%(error)s" retrieving uri %(uri)s',
                    {'error': e, 'uri': uri})
        return None
def get_uri_last_modified(uri):
    """Return the 'last-modified' header of a URI using a HEAD request.

    Any exception (including a missing header) is logged as a warning
    instead of being propagated.

    :param uri: URI to query
    :returns: header value, or None if an exception occurred
    """
    try:
        response = do_request(uri, method='head')
        return response.headers['last-modified']
    except Exception as e:
        LOG.warning('Error "%(error)s" retrieving uri %(uri)s',
                    {'error': e, 'uri': uri})
        return None
def cmp_to_key(mycmp):
    """Convert a cmp= function into a key= function.

    Thin wrapper around functools.cmp_to_key(), kept so that existing callers
    of this helper continue to work; the standard library implementation
    replaces the former hand-written port.

    :param mycmp: two-argument comparison function returning a negative
                  number, zero or a positive number
    :returns: callable that wraps a value into an object ordered by ``mycmp``
    """
    return functools.cmp_to_key(mycmp)
def make_range(start, stop, step):
    """Split the interval [start, stop) into consecutive ranges.

    Yields ranges of ``step`` items each; if the interval length is not a
    multiple of ``step`` the final range is shorter and ends at ``stop``.
    Nothing is yielded when the interval is empty (``stop <= start``);
    without that guard the modulo arithmetic below would produce a range
    lying outside of the requested bounds.

    :param start: first value of the interval
    :param stop: end of the interval (exclusive)
    :param step: number of items per yielded range
    :returns: generator of range objects
    """
    if stop <= start:
        return

    # end of the last chunk that holds exactly ``step`` items
    last_full = stop - ((stop - start) % step)
    for chunk_start in range(start, last_full, step):
        yield range(chunk_start, chunk_start + step)

    if stop > last_full:
        # leftover items that do not fill a whole chunk
        yield range(last_full, stop)
def load_repos(runtime_storage_inst):
    """Return the value stored under the 'repos' key of the runtime storage.

    :param runtime_storage_inst: object providing get_by_key()
    :returns: the stored value, or an empty list when it is falsy
    """
    repos = runtime_storage_inst.get_by_key('repos')
    if not repos:
        return []
    return repos
def unwrap_text(text):
    """Join hard-wrapped lines of text back together.

    Blank lines are dropped. A line keeps its line break when it does not
    start with a letter or when it ends with one of ``. ! ? > : ;``;
    otherwise it is joined to the following line with a single space.
    Trailing whitespace of the result is stripped.

    :param text: multi-line text
    :returns: unwrapped text
    """
    break_after = ('.', '!', '?', '>', ':', ';')
    pieces = []
    for line in text.splitlines():
        stripped = line.rstrip()
        if not stripped:
            continue

        keeps_break = (not stripped[0].isalpha() or
                       stripped.endswith(break_after))
        # the original line (including trailing spaces) is kept as is
        pieces.append(line)
        pieces.append('\n' if keeps_break else ' ')

    return ''.join(pieces).rstrip()
def format_text(s):
    """Prepare text for HTML output.

    Runs of two or more newlines are collapsed into one, the characters
    ``& < >`` are HTML-escaped (quotes are left untouched) and a zero-width
    space entity is inserted after every run of dots or slashes inside
    dotted/slashed words.

    :param s: text to format
    :returns: formatted text
    """
    collapsed = re.sub(re.compile('\n{2,}', flags=re.MULTILINE), '\n', s)
    # TODO(snikitin) Maybe we need to remove 'False' from escape()
    # to escape ' and " symbols?
    escaped = html.escape(collapsed, False)

    def add_break_hints(match_obj):
        return re.sub(r'([\./]+)', r'\1&#8203;', match_obj.group(0))

    return re.sub(r'((?:\w+[\./]+)+\w+)', add_break_hints, escaped)
def make_age_string(seconds):
    """Format a duration as 'D days and H hours'.

    Integer arithmetic is used on purpose: with true division the number of
    days is fractional, which makes the remaining hours always come out as
    zero.

    :param seconds: duration in seconds
    :returns: string such as '1 days and 1 hours'
    """
    days, remainder = divmod(seconds, 3600 * 24)
    hours = remainder // 3600
    return '%d days and %d hours' % (days, hours)
def merge_records(original, new):
    """Copy differing values from ``new`` into ``original``.

    :param original: dict updated in place
    :param new: dict whose items are merged into ``original``
    :returns: True if at least one value of ``original`` was changed or
              added, False otherwise
    """
    changed = False
    for field, new_value in six.iteritems(new):
        if original.get(field) != new_value:
            original[field] = new_value
            changed = True
    return changed
def get_blueprint_id(module, name):
    """Build a blueprint id of the form '<module>:<name>'.

    :param module: module name
    :param name: blueprint name
    :returns: combined id string
    """
    return ':'.join((module, name))
def make_bug_id(bug_id, module, release=None):
    """Build a bug id of the form '<module>[/<release>]/<bug_id>'.

    :param bug_id: bug identifier string
    :param module: module name
    :param release: optional release name; omitted from the id when falsy
    :returns: combined id string
    """
    parts = [module, release, bug_id] if release else [module, bug_id]
    return '/'.join(parts)
def get_patch_id(review_id, patch_number):
    """Build a patch id of the form '<review_id>:<patch_number>'.

    :param review_id: review identifier
    :param patch_number: patch set number
    :returns: combined id string
    """
    id_parts = (review_id, patch_number)
    return '%s:%s' % id_parts
def add_index(sequence, start=1, item_filter=lambda x: True):
    """Number the items of a sequence in place via their 'index' key.

    Items accepted by ``item_filter`` receive consecutive numbers beginning
    with ``start``; all other items get an empty string as index.

    :param sequence: iterable of dict-like items, modified in place
    :param start: number assigned to the first accepted item
    :param item_filter: predicate deciding whether an item is numbered
    :returns: the same ``sequence`` object
    """
    next_index = start
    for entry in sequence:
        if not item_filter(entry):
            entry['index'] = ''
            continue
        entry['index'] = next_index
        next_index += 1
    return sequence
def safe_encode(s):
    """Percent-encode a string after encoding it as UTF-8.

    :param s: text string
    :returns: result of urllib's quote() applied to the UTF-8 bytes
    """
    utf8_bytes = s.encode('utf-8')
    return six.moves.urllib.parse.quote(utf8_bytes)
def keep_safe_chars(s):
    """Remove every character outside of \\x21-\\x7e and \\x80-\\xff.

    This drops whitespace and control characters as well as any character
    above \\xff.

    :param s: text string
    :returns: string containing only the retained characters
    """
    unsafe_runs = re.compile(r'[^\x21-\x7e\x80-\xff]+')
    return unsafe_runs.sub('', s)
def make_module_group(module_group_id, name=None, modules=None, tag='module'):
    """Build a module group description dict.

    :param module_group_id: id of the group
    :param name: group name; falls back to ``module_group_id`` when falsy
    :param modules: modules of the group; falls back to a set holding only
                    ``module_group_id`` when falsy
    :param tag: tag stored with the group, 'module' by default
    :returns: dict with keys 'id', 'module_group_name', 'modules' and 'tag'
    """
    group_name = name or module_group_id
    group_modules = modules or {module_group_id}
    return {
        'id': module_group_id,
        'module_group_name': group_name,
        'modules': group_modules,
        'tag': tag,
    }
# Legal-form words that are removed from company names when they appear as
# whole words (matched case-insensitively).
BAD_NAME_SUFFIXES = ['Ltd', 'Pvt', 'Inc', 'GmbH', 'AG', 'Corporation', 'Corp',
                     'Company', 'Co', 'Group', 'Srl', 'Limited', 'LLC', 'IT']
# Legal-form abbreviations containing dots; removed when they appear at the
# start of the name or after whitespace (matched case-insensitively).
BAD_NAME_SUFFIXES_WITH_STOPS = ['S.p.A.', 's.r.o.', 'L.P.', 'B.V.', 'K.K.',
                                'd.o.o.']


def normalize_company_name(name):
    """Reduce a company name to a lower-case alphanumeric key.

    Legal-form suffixes listed in BAD_NAME_SUFFIXES and
    BAD_NAME_SUFFIXES_WITH_STOPS are removed, then every non-alphanumeric
    character is dropped and the rest is lower-cased.

    :param name: company name
    :returns: normalized name, e.g. 'Red Hat, Inc.' -> 'redhat'
    """
    # Suffixes are escaped so that their dots match literal dots only;
    # unescaped, 'L.P.' would also match e.g. the start of 'Lipstick'.
    plain = '|'.join(re.escape(suffix) for suffix in BAD_NAME_SUFFIXES)
    dotted = '|'.join(re.escape(suffix)
                      for suffix in BAD_NAME_SUFFIXES_WITH_STOPS)
    regex = '(\\b(' + plain + ')\\b)'
    regex += '|' + '((^|\\s)(' + dotted + '))'

    name = re.sub(re.compile(regex, re.IGNORECASE), '', name)
    return ''.join(c.lower() for c in name if c.isalnum())
def normalize_company_draft(name):
    """Replace commas with spaces and collapse runs of whitespace.

    :param name: company name
    :returns: name with every comma turned into a space and every run of
              whitespace reduced to a single space
    """
    without_commas = name.replace(',', ' ')
    return re.sub(r'\s+', ' ', without_commas)
def validate_lp_display_name(lp_profile):
    """Replace a hidden-email display name with the profile name.

    When the profile's 'display_name' equals '<email address hidden>' it is
    overwritten in place with the profile's 'name'. A falsy profile is left
    alone.

    :param lp_profile: profile dict with 'display_name' and 'name' keys, or
                       a falsy value
    """
    if not lp_profile:
        return

    if lp_profile['display_name'] == "<email address hidden>":
        lp_profile['display_name'] = lp_profile['name']
def make_pipeline_processor(processors):
    """Combine multi-pass processors into a single record pipeline.

    Every processor is a callable returning a generator that yields one
    record handler per pass. The returned function runs pass after pass; in
    each pass it re-reads all records from the producer and feeds every
    record to each handler of that pass, yielding whatever the handlers
    produce.

    :param processors: list of callables returning handler generators
    :returns: function taking a record producer (a callable returning an
              iterable of records) and returning a generator of processed
              records
    """

    def get_passes(_processors):
        # every processor yields one or more record handlers
        # this function groups record handlers by pass and returns list of them
        handler_sources = [make_source() for make_source in _processors]

        while True:
            handlers_of_pass = []
            for source in handler_sources:
                try:
                    handlers_of_pass.append(next(source))
                except StopIteration:
                    # this processor has no handler for the current pass
                    pass

            if not handlers_of_pass:
                # all processors are exhausted
                return
            yield handlers_of_pass

    def pipeline_processor(record_generator):
        # for every pass
        for handlers_of_pass in get_passes(processors):
            # iterate every record in producer
            for record in record_generator():
                # iterate over record handlers within single pass
                for handle in handlers_of_pass:
                    # feed record to the handler; a handler may return None
                    produced = handle(record) or []
                    for processed in produced:
                        # yield processed record
                        yield processed

    return pipeline_processor