Add standalone get_timestamp implementation

This change provides a simpler and faster solution to avoid
overloading the service.

Change-Id: Ifbfe1eb6cd2bb130ca34dbdedc0d247db9872353
This commit is contained in:
Tristan Cacqueray 2022-03-28 13:44:36 +00:00 committed by Daniel Pawlik
parent 557cda54d8
commit 5ee54f38a5
4 changed files with 52 additions and 73 deletions

View File

@ -34,6 +34,8 @@ continuous-integration environment, powered by `Zuul
Any changes to logscraper script or tests will trigger jobs to
thoroughly test those changes.
To run a single test: *tox -epy38 <test_name>*
Benchmarking
------------

View File

@ -21,13 +21,11 @@ The goal is to get content from build uuid directory and send to Opensearch
import argparse
import copy
import datefinder
import datetime
import itertools
import logging
import multiprocessing
import os
import parsedatetime
import re
import shutil
import sys
@ -189,51 +187,38 @@ def makeFields(build_inventory, buildinfo):
return fields
def alt_parse_timeformat(line):
"""Return isoformat datetime if all parsing fails"""
# The paredatetime lib is almost perfect converting timestamp, but
# in few logs it increse/decrease year or hour.
# Use it as a last hope.
p = parsedatetime.Calendar()
parse_time = p.parse(line)
if parse_time:
return datetime.datetime.fromtimestamp(
time.mktime(parse_time[0]))
timestamp_patterns = [
# 2022-03-25T17:40:37.220547Z
(re.compile(r"(\S+)"), "%Y-%m-%dT%H:%M:%S.%fZ"),
# 2022-02-28 09:44:58.839036
(re.compile(r"(\S+ \S+)"), "%Y-%m-%d %H:%M:%S.%f"),
# Mar 25 17:40:37 : TODO(tdecacqu): fix the log file format
# because guessing the YEAR is error prone
(re.compile(r"(\S+ \S+ \S+)"), "%b %d %H:%M:%S"),
# 2022-03-23T11:46:49+0000 - isoformat
(re.compile(r"([0-9-T:]{19})"), "%Y-%m-%dT%H:%M:%S"),
# Friday 25 February 2022 09:27:51 +0000 - ansible
(re.compile(r"(\S+ [0-9]{2} \S+ [0-9: ]{14})"), "%A %d %B %Y %H:%M:%S")
]
def parse_text_timeformat(line):
def try_timestamp(regex, fmt, line):
try:
# check it time is parsed like:
# Mar 16 18:41:47 fedora-host some info
timestamp_search = re.search(r'\w{3}\s[0-9]{2}\s[0-9.:]{8}',
line.split("|", 1)[0])
if timestamp_search:
return datetime.datetime.strptime(
timestamp_search.group(), '%b %d %H:%M:%S').replace(
year=datetime.date.today().year)
except TypeError:
return alt_parse_timeformat(line)
def find_time_in_line(line):
try:
text_timeformat = parse_text_timeformat(line)
if text_timeformat:
return text_timeformat
found_date = list(datefinder.find_dates(line))
if found_date:
return found_date[0]
except TypeError:
return alt_parse_timeformat(line)
if match := regex.match(line):
timestamp_string = match.groups()[0]
date = datetime.datetime.strptime(timestamp_string, fmt)
if date.year == 1900:
# Handle missing year
date = date.replace(year=datetime.date.today().year)
return date
except ValueError:
pass
def get_timestamp(line):
"""Parse log time format like and return in iso format"""
time = find_time_in_line(line)
if time and isinstance(time, datetime.datetime):
return time.isoformat(sep='T', timespec='milliseconds').replace(
'+00:00', '')
for (regex, fmt) in timestamp_patterns:
if res := try_timestamp(regex, fmt, line):
return res
def get_message(line):
@ -261,12 +246,15 @@ def logline_iter(build_file):
with open_file(build_file) as f:
while True:
line = f.readline()
if last_known_timestamp is None and line.startswith(
"-- Logs begin at "):
continue
if line:
ts = get_timestamp(line)
if ts:
last_known_timestamp = ts
elif not last_known_timestamp and not ts:
ts = datetime.datetime.utcnow().isoformat()
ts = datetime.datetime.utcnow()
else:
ts = last_known_timestamp
yield (ts, line)
@ -277,7 +265,7 @@ def logline_iter(build_file):
def doc_iter(inner, index, es_fields, doc_type, chunk_size):
for (ts, line) in inner:
fields = copy.deepcopy(es_fields)
fields["@timestamp"] = ts
fields["@timestamp"] = ts.isoformat()
message = get_message(line)
if not message:

View File

@ -545,9 +545,9 @@ class TestSender(base.TestCase):
@mock.patch('logscraper.logsender.logline_iter')
def test_doc_iter(self, mock_logline):
text = [('2022-02-28T09:39:09.596000',
text = [(datetime.datetime(2022, 2, 28, 9, 39, 9, 596000),
'2022-02-28 09:39:09.596010 | Job console starting...\n'),
('2022-02-28T09:39:09.610000',
(datetime.datetime(2022, 2, 28, 9, 39, 9, 610000),
'2022-02-28 09:39:09.610160 | Updating repositories\n')]
expected_chunk = [{
'_index': 'someindex',
@ -577,11 +577,11 @@ class TestSender(base.TestCase):
2022-02-28 09:39:09.996 | Preparing job workspace"""
expected_data = [
('2022-02-28T09:39:09.596',
(datetime.datetime(2022, 2, 28, 9, 39, 9, 596000),
'2022-02-28 09:39:09.596 | Job console starting...\n'),
('2022-02-28T09:39:09.610',
(datetime.datetime(2022, 2, 28, 9, 39, 9, 610000),
'2022-02-28 09:39:09.610 | Updating repositories\n'),
('2022-02-28T09:39:09.996',
(datetime.datetime(2022, 2, 28, 9, 39, 9, 996000),
'2022-02-28 09:39:09.996 | Preparing job workspace')
]
readed_data = mock.mock_open(read_data=text)
@ -608,29 +608,20 @@ class TestSender(base.TestCase):
logsender.get_message(line_2))
def test_get_timestamp(self):
line_1 = "28-02-2022 09:44:58.839036 | Some message"
line_2 = "2022-02-28 09:44:58.839036 | Other message"
self.assertEqual("2022-02-28T09:44:58.839",
logsender.get_timestamp(line_1))
self.assertEqual("2022-02-28T09:44:58.839",
logsender.get_timestamp(line_2))
@mock.patch('logscraper.logsender.get_file_info')
def test_get_timestamp_isoformat(self, mock_info):
# logstash.log
line_1 = "2022-03-21T08:39:18.220547Z | Last metadata expiration"
self.assertEqual("2022-03-21T08:39:18.220",
logsender.get_timestamp(line_1))
def test_get_timestamp_textformat(self):
# syslog.log
line_1 = "-- Logs begin at Mon 2022-03-21 09:22:16 UTC"
line_2 = "Mar 21 09:33:23 fedora-rax-dfw-0028920567 sudo[2786]: zuul "
# FIXME: the line_1 should be skipped
self.assertEqual('2022-03-21T09:22:16.000',
logsender.get_timestamp(line_1))
self.assertEqual("2022-03-21T09:33:23.000",
logsender.get_timestamp(line_2))
for (line, expected) in [
("2022-02-28 09:44:58.839036 | Other message",
datetime.datetime(2022, 2, 28, 9, 44, 58, 839036)),
("2022-03-21T08:39:18.220547Z | Last metadata expiration",
datetime.datetime(2022, 3, 21, 8, 39, 18, 220547)),
("Mar 21 09:33:23 fedora-rax-dfw-0028920567 sudo[2786]: zuul ",
datetime.datetime(datetime.date.today().year, 3, 21, 9, 33, 23)),
("2022-03-23T13:09:08.644Z|00040|connmgr|INFO|br-int: added",
datetime.datetime(2022, 3, 23, 13, 9, 8)),
("Friday 25 February 2022 09:27:51 +0000 (0:00:00.056)",
datetime.datetime(2022, 2, 25, 9, 27, 51)),
]:
got = logsender.get_timestamp(line)
self.assertEqual(expected, got)
@mock.patch('ruamel.yaml.YAML.load')
@mock.patch('logscraper.logsender.open_file')

View File

@ -5,5 +5,3 @@ PyYAML<6.1 # MIT
tenacity
opensearch-py<=1.0.0 # Apache-2.0
ruamel.yaml
datefinder # MIT
parsedatetime # Apache-2.0