Added ability to read emails from *.txt archives

Change-Id: I34095519497f1a0c85b6d640d5457af7ea4df9f9
This commit is contained in:
Sergey Nikitin 2019-08-11 14:17:01 +04:00
parent 8cf7cd0950
commit 639dc820ce
5 changed files with 21 additions and 5 deletions

View File

@ -204,6 +204,7 @@ def _process_repo(repo, runtime_storage_inst, record_processor_inst):
def _process_mail_list(uri, runtime_storage_inst, record_processor_inst): def _process_mail_list(uri, runtime_storage_inst, record_processor_inst):
LOG.info("Processing mail list %s" % uri)
mail_iterator = mls.log(uri, runtime_storage_inst) mail_iterator = mls.log(uri, runtime_storage_inst)
mail_iterator_typed = _record_typer(mail_iterator, 'email') mail_iterator_typed = _record_typer(mail_iterator, 'email')
processed_mail_iterator = record_processor_inst.process( processed_mail_iterator = record_processor_inst.process(

View File

@ -58,8 +58,8 @@ def _get_mail_archive_links(uri):
LOG.warning('Mail archive list is not found at %s', uri) LOG.warning('Mail archive list is not found at %s', uri)
return [] return []
links = set(re.findall(r'\shref\s*=\s*[\'"]([^\'"]*\.txt\.gz)', content, links = set(re.findall(r'\shref\s*=\s*[\'"]([^\'"]*\.txt(?:\.gz)?)',
flags=re.IGNORECASE)) content, flags=re.IGNORECASE))
return [parse.urljoin(uri, link) for link in links] return [parse.urljoin(uri, link) for link in links]
@ -91,7 +91,12 @@ def _optimize_body(email_body):
def _retrieve_mails(uri): def _retrieve_mails(uri):
LOG.debug('Retrieving mail archive from: %s', uri) LOG.debug('Retrieving mail archive from: %s', uri)
if uri.endswith('.gz'):
content = utils.read_gzip_from_uri(uri) content = utils.read_gzip_from_uri(uri)
else:
content = utils.read_txt_from_uri(uri)
if not content: if not content:
LOG.error('Error reading mail archive from: %s', uri) LOG.error('Error reading mail archive from: %s', uri)
return return
@ -129,6 +134,7 @@ def log(uri, runtime_storage_inst):
links = _get_mail_archive_links(uri) links = _get_mail_archive_links(uri)
for link in links: for link in links:
LOG.info("Processing emails from %s" % link)
if _uri_content_changed(link, runtime_storage_inst): if _uri_content_changed(link, runtime_storage_inst):
for mail in _retrieve_mails(link): for mail in _retrieve_mails(link):
LOG.debug('New mail: %s', mail['message_id']) LOG.debug('New mail: %s', mail['message_id'])

View File

@ -350,7 +350,8 @@ class RecordProcessor(object):
modules, alias_module_map = self._get_modules() modules, alias_module_map = self._get_modules()
for module in modules: for module in modules:
find = subject.find(module) find = subject.find(module)
if (find >= 0) and (find < pos): if (find >= 0) and (find < pos) \
and (len(module) > len(best_guess_module or '')):
pos = find pos = find
best_guess_module = module best_guess_module = module

View File

@ -167,6 +167,14 @@ def _gzip_decompress(content):
return gzip_fd.read() return gzip_fd.read()
def read_txt_from_uri(uri):
try:
return do_request(uri).content.decode('utf8')
except Exception as e:
LOG.warning('Error "%(error)s" retrieving uri %(uri)s',
{'error': e, 'uri': uri})
def read_gzip_from_uri(uri): def read_gzip_from_uri(uri):
try: try:
return _gzip_decompress(do_request(uri).content) return _gzip_decompress(do_request(uri).content)

View File

@ -82,7 +82,7 @@ From: sorlando at nicira.com (Salvatore Orlando)
def test_log(self, mock_uri_content_changed, mock_get_mail_archive_links, def test_log(self, mock_uri_content_changed, mock_get_mail_archive_links,
mock_read_gzip_from_uri): mock_read_gzip_from_uri):
mock_uri_content_changed.return_value = True mock_uri_content_changed.return_value = True
mock_get_mail_archive_links.return_value = ['link'] mock_get_mail_archive_links.return_value = ['link.txt.gz']
mock_read_gzip_from_uri.return_value = EMAIL_CONTENT mock_read_gzip_from_uri.return_value = EMAIL_CONTENT
mock_rsi = mock.Mock() mock_rsi = mock.Mock()