Added ability to read emails from *.txt archives
Change-Id: I34095519497f1a0c85b6d640d5457af7ea4df9f9
This commit is contained in:
parent
8cf7cd0950
commit
639dc820ce
@ -204,6 +204,7 @@ def _process_repo(repo, runtime_storage_inst, record_processor_inst):
|
||||
|
||||
|
||||
def _process_mail_list(uri, runtime_storage_inst, record_processor_inst):
|
||||
LOG.info("Processing mail list %s" % uri)
|
||||
mail_iterator = mls.log(uri, runtime_storage_inst)
|
||||
mail_iterator_typed = _record_typer(mail_iterator, 'email')
|
||||
processed_mail_iterator = record_processor_inst.process(
|
||||
|
@ -58,8 +58,8 @@ def _get_mail_archive_links(uri):
|
||||
LOG.warning('Mail archive list is not found at %s', uri)
|
||||
return []
|
||||
|
||||
links = set(re.findall(r'\shref\s*=\s*[\'"]([^\'"]*\.txt\.gz)', content,
|
||||
flags=re.IGNORECASE))
|
||||
links = set(re.findall(r'\shref\s*=\s*[\'"]([^\'"]*\.txt(?:\.gz)?)',
|
||||
content, flags=re.IGNORECASE))
|
||||
return [parse.urljoin(uri, link) for link in links]
|
||||
|
||||
|
||||
@ -91,7 +91,12 @@ def _optimize_body(email_body):
|
||||
|
||||
def _retrieve_mails(uri):
|
||||
LOG.debug('Retrieving mail archive from: %s', uri)
|
||||
|
||||
if uri.endswith('.gz'):
|
||||
content = utils.read_gzip_from_uri(uri)
|
||||
else:
|
||||
content = utils.read_txt_from_uri(uri)
|
||||
|
||||
if not content:
|
||||
LOG.error('Error reading mail archive from: %s', uri)
|
||||
return
|
||||
@ -129,6 +134,7 @@ def log(uri, runtime_storage_inst):
|
||||
|
||||
links = _get_mail_archive_links(uri)
|
||||
for link in links:
|
||||
LOG.info("Processing emails from %s" % link)
|
||||
if _uri_content_changed(link, runtime_storage_inst):
|
||||
for mail in _retrieve_mails(link):
|
||||
LOG.debug('New mail: %s', mail['message_id'])
|
||||
|
@ -350,7 +350,8 @@ class RecordProcessor(object):
|
||||
modules, alias_module_map = self._get_modules()
|
||||
for module in modules:
|
||||
find = subject.find(module)
|
||||
if (find >= 0) and (find < pos):
|
||||
if (find >= 0) and (find < pos) \
|
||||
and (len(module) > len(best_guess_module or '')):
|
||||
pos = find
|
||||
best_guess_module = module
|
||||
|
||||
|
@ -167,6 +167,14 @@ def _gzip_decompress(content):
|
||||
return gzip_fd.read()
|
||||
|
||||
|
||||
def read_txt_from_uri(uri):
|
||||
try:
|
||||
return do_request(uri).content.decode('utf8')
|
||||
except Exception as e:
|
||||
LOG.warning('Error "%(error)s" retrieving uri %(uri)s',
|
||||
{'error': e, 'uri': uri})
|
||||
|
||||
|
||||
def read_gzip_from_uri(uri):
|
||||
try:
|
||||
return _gzip_decompress(do_request(uri).content)
|
||||
|
@ -82,7 +82,7 @@ From: sorlando at nicira.com (Salvatore Orlando)
|
||||
def test_log(self, mock_uri_content_changed, mock_get_mail_archive_links,
|
||||
mock_read_gzip_from_uri):
|
||||
mock_uri_content_changed.return_value = True
|
||||
mock_get_mail_archive_links.return_value = ['link']
|
||||
mock_get_mail_archive_links.return_value = ['link.txt.gz']
|
||||
mock_read_gzip_from_uri.return_value = EMAIL_CONTENT
|
||||
mock_rsi = mock.Mock()
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user