diff --git a/stackalytics/processor/main.py b/stackalytics/processor/main.py
index 9074bb6ce..0f0c351da 100644
--- a/stackalytics/processor/main.py
+++ b/stackalytics/processor/main.py
@@ -204,6 +204,7 @@ def _process_repo(repo, runtime_storage_inst, record_processor_inst):
 
 
 def _process_mail_list(uri, runtime_storage_inst, record_processor_inst):
+    LOG.info("Processing mail list %s", uri)
     mail_iterator = mls.log(uri, runtime_storage_inst)
     mail_iterator_typed = _record_typer(mail_iterator, 'email')
     processed_mail_iterator = record_processor_inst.process(
diff --git a/stackalytics/processor/mls.py b/stackalytics/processor/mls.py
index 54d13e432..63d481efc 100644
--- a/stackalytics/processor/mls.py
+++ b/stackalytics/processor/mls.py
@@ -58,8 +58,8 @@ def _get_mail_archive_links(uri):
         LOG.warning('Mail archive list is not found at %s', uri)
         return []
 
-    links = set(re.findall(r'\shref\s*=\s*[\'"]([^\'"]*\.txt\.gz)', content,
-                           flags=re.IGNORECASE))
+    links = set(re.findall(r'\shref\s*=\s*[\'"]([^\'"]*\.txt(?:\.gz)?)',
+                           content, flags=re.IGNORECASE))
     return [parse.urljoin(uri, link) for link in links]
 
 
@@ -91,7 +91,12 @@ def _optimize_body(email_body):
 
 def _retrieve_mails(uri):
     LOG.debug('Retrieving mail archive from: %s', uri)
-    content = utils.read_gzip_from_uri(uri)
+
+    if uri.endswith('.gz'):
+        content = utils.read_gzip_from_uri(uri)
+    else:
+        content = utils.read_txt_from_uri(uri)
+
     if not content:
         LOG.error('Error reading mail archive from: %s', uri)
         return
@@ -129,6 +134,7 @@ def log(uri, runtime_storage_inst):
 
     links = _get_mail_archive_links(uri)
     for link in links:
+        LOG.info("Processing emails from %s", link)
         if _uri_content_changed(link, runtime_storage_inst):
             for mail in _retrieve_mails(link):
                 LOG.debug('New mail: %s', mail['message_id'])
diff --git a/stackalytics/processor/record_processor.py b/stackalytics/processor/record_processor.py
index 682785f95..2a43759b7 100644
--- a/stackalytics/processor/record_processor.py
+++ b/stackalytics/processor/record_processor.py
@@ -350,7 +350,8 @@ class RecordProcessor(object):
         modules, alias_module_map = self._get_modules()
         for module in modules:
             find = subject.find(module)
-            if (find >= 0) and (find < pos):
+            if (find >= 0) and (find < pos) \
+                    and (len(module) > len(best_guess_module or '')):
                 pos = find
                 best_guess_module = module
 
diff --git a/stackalytics/processor/utils.py b/stackalytics/processor/utils.py
index cf367ac42..21fd08a3f 100644
--- a/stackalytics/processor/utils.py
+++ b/stackalytics/processor/utils.py
@@ -167,6 +167,15 @@ def _gzip_decompress(content):
         return gzip_fd.read()
 
 
+def read_txt_from_uri(uri):
+    """Fetch uri and return its body decoded as UTF-8, or None on error."""
+    try:
+        return do_request(uri).content.decode('utf8')
+    except Exception as e:
+        LOG.warning('Error "%(error)s" retrieving uri %(uri)s',
+                    {'error': e, 'uri': uri})
+
+
 def read_gzip_from_uri(uri):
     try:
         return _gzip_decompress(do_request(uri).content)
diff --git a/stackalytics/tests/unit/test_mls.py b/stackalytics/tests/unit/test_mls.py
index 3e6a2d49d..5d2488f6d 100644
--- a/stackalytics/tests/unit/test_mls.py
+++ b/stackalytics/tests/unit/test_mls.py
@@ -82,7 +82,7 @@ From: sorlando at nicira.com (Salvatore Orlando)
     def test_log(self, mock_uri_content_changed, mock_get_mail_archive_links,
                  mock_read_gzip_from_uri):
         mock_uri_content_changed.return_value = True
-        mock_get_mail_archive_links.return_value = ['link']
+        mock_get_mail_archive_links.return_value = ['link.txt.gz']
         mock_read_gzip_from_uri.return_value = EMAIL_CONTENT
         mock_rsi = mock.Mock()
 