Add standalone get_timestamp implementation

This change provides a simpler and faster solution to avoid
overloading the service.

Change-Id: Ifbfe1eb6cd2bb130ca34dbdedc0d247db9872353
This commit is contained in:
Tristan Cacqueray 2022-03-28 13:44:36 +00:00 committed by Daniel Pawlik
parent 557cda54d8
commit 5ee54f38a5
4 changed files with 52 additions and 73 deletions

View File

@ -34,6 +34,8 @@ continuous-integration environment, powered by `Zuul
Any changes to logscraper script or tests will trigger jobs to Any changes to logscraper script or tests will trigger jobs to
thoroughly test those changes. thoroughly test those changes.
To run a single test: *tox -epy38 <test_name>*
Benchmarking Benchmarking
------------ ------------

View File

@ -21,13 +21,11 @@ The goal is to get content from build uuid directory and send to Opensearch
import argparse import argparse
import copy import copy
import datefinder
import datetime import datetime
import itertools import itertools
import logging import logging
import multiprocessing import multiprocessing
import os import os
import parsedatetime
import re import re
import shutil import shutil
import sys import sys
@ -189,51 +187,38 @@ def makeFields(build_inventory, buildinfo):
return fields return fields
def alt_parse_timeformat(line): timestamp_patterns = [
"""Return isoformat datetime if all parsing fails""" # 2022-03-25T17:40:37.220547Z
# The paredatetime lib is almost perfect converting timestamp, but (re.compile(r"(\S+)"), "%Y-%m-%dT%H:%M:%S.%fZ"),
# in few logs it increse/decrease year or hour. # 2022-02-28 09:44:58.839036
# Use it as a last hope. (re.compile(r"(\S+ \S+)"), "%Y-%m-%d %H:%M:%S.%f"),
p = parsedatetime.Calendar() # Mar 25 17:40:37 : TODO(tdecacqu): fix the log file format
parse_time = p.parse(line) # because guessing the YEAR is error prone
if parse_time: (re.compile(r"(\S+ \S+ \S+)"), "%b %d %H:%M:%S"),
return datetime.datetime.fromtimestamp( # 2022-03-23T11:46:49+0000 - isoformat
time.mktime(parse_time[0])) (re.compile(r"([0-9-T:]{19})"), "%Y-%m-%dT%H:%M:%S"),
# Friday 25 February 2022 09:27:51 +0000 - ansible
(re.compile(r"(\S+ [0-9]{2} \S+ [0-9: ]{14})"), "%A %d %B %Y %H:%M:%S")
]
def parse_text_timeformat(line): def try_timestamp(regex, fmt, line):
try: try:
# check it time is parsed like: if match := regex.match(line):
# Mar 16 18:41:47 fedora-host some info timestamp_string = match.groups()[0]
timestamp_search = re.search(r'\w{3}\s[0-9]{2}\s[0-9.:]{8}', date = datetime.datetime.strptime(timestamp_string, fmt)
line.split("|", 1)[0]) if date.year == 1900:
if timestamp_search: # Handle missing year
return datetime.datetime.strptime( date = date.replace(year=datetime.date.today().year)
timestamp_search.group(), '%b %d %H:%M:%S').replace( return date
year=datetime.date.today().year) except ValueError:
except TypeError: pass
return alt_parse_timeformat(line)
def find_time_in_line(line):
try:
text_timeformat = parse_text_timeformat(line)
if text_timeformat:
return text_timeformat
found_date = list(datefinder.find_dates(line))
if found_date:
return found_date[0]
except TypeError:
return alt_parse_timeformat(line)
def get_timestamp(line): def get_timestamp(line):
"""Parse log time format like and return in iso format""" for (regex, fmt) in timestamp_patterns:
time = find_time_in_line(line) if res := try_timestamp(regex, fmt, line):
if time and isinstance(time, datetime.datetime): return res
return time.isoformat(sep='T', timespec='milliseconds').replace(
'+00:00', '')
def get_message(line): def get_message(line):
@ -261,12 +246,15 @@ def logline_iter(build_file):
with open_file(build_file) as f: with open_file(build_file) as f:
while True: while True:
line = f.readline() line = f.readline()
if last_known_timestamp is None and line.startswith(
"-- Logs begin at "):
continue
if line: if line:
ts = get_timestamp(line) ts = get_timestamp(line)
if ts: if ts:
last_known_timestamp = ts last_known_timestamp = ts
elif not last_known_timestamp and not ts: elif not last_known_timestamp and not ts:
ts = datetime.datetime.utcnow().isoformat() ts = datetime.datetime.utcnow()
else: else:
ts = last_known_timestamp ts = last_known_timestamp
yield (ts, line) yield (ts, line)
@ -277,7 +265,7 @@ def logline_iter(build_file):
def doc_iter(inner, index, es_fields, doc_type, chunk_size): def doc_iter(inner, index, es_fields, doc_type, chunk_size):
for (ts, line) in inner: for (ts, line) in inner:
fields = copy.deepcopy(es_fields) fields = copy.deepcopy(es_fields)
fields["@timestamp"] = ts fields["@timestamp"] = ts.isoformat()
message = get_message(line) message = get_message(line)
if not message: if not message:

View File

@ -545,9 +545,9 @@ class TestSender(base.TestCase):
@mock.patch('logscraper.logsender.logline_iter') @mock.patch('logscraper.logsender.logline_iter')
def test_doc_iter(self, mock_logline): def test_doc_iter(self, mock_logline):
text = [('2022-02-28T09:39:09.596000', text = [(datetime.datetime(2022, 2, 28, 9, 39, 9, 596000),
'2022-02-28 09:39:09.596010 | Job console starting...\n'), '2022-02-28 09:39:09.596010 | Job console starting...\n'),
('2022-02-28T09:39:09.610000', (datetime.datetime(2022, 2, 28, 9, 39, 9, 610000),
'2022-02-28 09:39:09.610160 | Updating repositories\n')] '2022-02-28 09:39:09.610160 | Updating repositories\n')]
expected_chunk = [{ expected_chunk = [{
'_index': 'someindex', '_index': 'someindex',
@ -577,11 +577,11 @@ class TestSender(base.TestCase):
2022-02-28 09:39:09.996 | Preparing job workspace""" 2022-02-28 09:39:09.996 | Preparing job workspace"""
expected_data = [ expected_data = [
('2022-02-28T09:39:09.596', (datetime.datetime(2022, 2, 28, 9, 39, 9, 596000),
'2022-02-28 09:39:09.596 | Job console starting...\n'), '2022-02-28 09:39:09.596 | Job console starting...\n'),
('2022-02-28T09:39:09.610', (datetime.datetime(2022, 2, 28, 9, 39, 9, 610000),
'2022-02-28 09:39:09.610 | Updating repositories\n'), '2022-02-28 09:39:09.610 | Updating repositories\n'),
('2022-02-28T09:39:09.996', (datetime.datetime(2022, 2, 28, 9, 39, 9, 996000),
'2022-02-28 09:39:09.996 | Preparing job workspace') '2022-02-28 09:39:09.996 | Preparing job workspace')
] ]
readed_data = mock.mock_open(read_data=text) readed_data = mock.mock_open(read_data=text)
@ -608,29 +608,20 @@ class TestSender(base.TestCase):
logsender.get_message(line_2)) logsender.get_message(line_2))
def test_get_timestamp(self): def test_get_timestamp(self):
line_1 = "28-02-2022 09:44:58.839036 | Some message" for (line, expected) in [
line_2 = "2022-02-28 09:44:58.839036 | Other message" ("2022-02-28 09:44:58.839036 | Other message",
self.assertEqual("2022-02-28T09:44:58.839", datetime.datetime(2022, 2, 28, 9, 44, 58, 839036)),
logsender.get_timestamp(line_1)) ("2022-03-21T08:39:18.220547Z | Last metadata expiration",
self.assertEqual("2022-02-28T09:44:58.839", datetime.datetime(2022, 3, 21, 8, 39, 18, 220547)),
logsender.get_timestamp(line_2)) ("Mar 21 09:33:23 fedora-rax-dfw-0028920567 sudo[2786]: zuul ",
datetime.datetime(datetime.date.today().year, 3, 21, 9, 33, 23)),
@mock.patch('logscraper.logsender.get_file_info') ("2022-03-23T13:09:08.644Z|00040|connmgr|INFO|br-int: added",
def test_get_timestamp_isoformat(self, mock_info): datetime.datetime(2022, 3, 23, 13, 9, 8)),
# logstash.log ("Friday 25 February 2022 09:27:51 +0000 (0:00:00.056)",
line_1 = "2022-03-21T08:39:18.220547Z | Last metadata expiration" datetime.datetime(2022, 2, 25, 9, 27, 51)),
self.assertEqual("2022-03-21T08:39:18.220", ]:
logsender.get_timestamp(line_1)) got = logsender.get_timestamp(line)
self.assertEqual(expected, got)
def test_get_timestamp_textformat(self):
# syslog.log
line_1 = "-- Logs begin at Mon 2022-03-21 09:22:16 UTC"
line_2 = "Mar 21 09:33:23 fedora-rax-dfw-0028920567 sudo[2786]: zuul "
# FIXME: the line_1 should be skipped
self.assertEqual('2022-03-21T09:22:16.000',
logsender.get_timestamp(line_1))
self.assertEqual("2022-03-21T09:33:23.000",
logsender.get_timestamp(line_2))
@mock.patch('ruamel.yaml.YAML.load') @mock.patch('ruamel.yaml.YAML.load')
@mock.patch('logscraper.logsender.open_file') @mock.patch('logscraper.logsender.open_file')

View File

@ -5,5 +5,3 @@ PyYAML<6.1 # MIT
tenacity tenacity
opensearch-py<=1.0.0 # Apache-2.0 opensearch-py<=1.0.0 # Apache-2.0
ruamel.yaml ruamel.yaml
datefinder # MIT
parsedatetime # Apache-2.0