Add standalone get_timestamp implementation
This change provides a simpler and faster solution to avoid overloading the service. Change-Id: Ifbfe1eb6cd2bb130ca34dbdedc0d247db9872353
This commit is contained in:
parent
557cda54d8
commit
5ee54f38a5
|
@ -34,6 +34,8 @@ continuous-integration environment, powered by `Zuul
|
|||
Any changes to the logscraper script or tests will trigger jobs to
|
||||
thoroughly test those changes.
|
||||
|
||||
To run a single test: *tox -epy38 <test_name>*
|
||||
|
||||
|
||||
Benchmarking
|
||||
------------
|
||||
|
|
|
@ -21,13 +21,11 @@ The goal is to get content from build uuid directory and send to Opensearch
|
|||
|
||||
import argparse
import copy
import datetime
import itertools
import logging
import multiprocessing
import os
import re
import shutil
import sys
import time

import datefinder
import parsedatetime
|
||||
|
@ -189,51 +187,38 @@ def makeFields(build_inventory, buildinfo):
|
|||
return fields
|
||||
|
||||
|
||||
def alt_parse_timeformat(line):
    """Last-resort timestamp parser.

    The parsedatetime lib is almost perfect at converting timestamps,
    but in a few logs it increases/decreases the year or hour, so it
    is only used when every other parser has failed.

    :param line: raw log line to scan for a date/time
    :returns: datetime.datetime on success, None when nothing parseable
        was found
    """
    calendar = parsedatetime.Calendar()
    parse_time = calendar.parse(line)
    # Calendar.parse() returns a (time_struct, status) tuple, which is
    # always truthy -- the status flag (0 == nothing parsed) must be
    # checked explicitly, otherwise garbage lines yield bogus dates.
    if parse_time and parse_time[1]:
        return datetime.datetime.fromtimestamp(
            time.mktime(parse_time[0]))
|
||||
# Known log timestamp layouts, tried in order by get_timestamp(); each
# entry pairs a capturing regex with the strptime format for the captured
# text.  Order matters: earlier (more specific) layouts win.
timestamp_patterns = [
    # 2022-03-25T17:40:37.220547Z
    (re.compile(r"(\S+)"), "%Y-%m-%dT%H:%M:%S.%fZ"),
    # 2022-02-28 09:44:58.839036
    (re.compile(r"(\S+ \S+)"), "%Y-%m-%d %H:%M:%S.%f"),
    # Mar 25 17:40:37 : TODO(tdecacqu): fix the log file format
    # because guessing the YEAR is error prone
    (re.compile(r"(\S+ \S+ \S+)"), "%b %d %H:%M:%S"),
    # 2022-03-23T11:46:49+0000 - isoformat
    (re.compile(r"([0-9-T:]{19})"), "%Y-%m-%dT%H:%M:%S"),
    # Friday 25 February 2022 09:27:51 +0000 - ansible
    (re.compile(r"(\S+ [0-9]{2} \S+ [0-9: ]{14})"), "%A %d %B %Y %H:%M:%S")
]
|
||||
|
||||
|
||||
def try_timestamp(regex, fmt, line):
    """Attempt to parse *line* with one (regex, strptime-format) pair.

    :param regex: compiled pattern whose first group captures the
        timestamp portion of the line
    :param fmt: datetime.strptime format string for the captured text
    :param line: raw log line
    :returns: datetime.datetime on success, None when the regex does not
        match or the captured text does not satisfy *fmt*
    """
    try:
        if match := regex.match(line):
            timestamp_string = match.groups()[0]
            date = datetime.datetime.strptime(timestamp_string, fmt)
            if date.year == 1900:
                # strptime defaults the year to 1900 when the format has
                # no %Y (e.g. syslog "Mar 25 17:40:37"); assume the
                # current year instead.
                date = date.replace(year=datetime.date.today().year)
            return date
    except ValueError:
        # Captured text did not satisfy fmt -- let the caller try the
        # next pattern.
        pass
|
||||
|
||||
|
||||
def get_timestamp(line):
    """Parse the leading log timestamp of *line*.

    Tries each known layout from timestamp_patterns in order and returns
    the first successfully parsed value.

    :param line: raw log line
    :returns: datetime.datetime on success, None when no known pattern
        matches
    """
    for (regex, fmt) in timestamp_patterns:
        if res := try_timestamp(regex, fmt, line):
            return res
|
||||
|
||||
|
||||
def get_message(line):
|
||||
|
@ -261,12 +246,15 @@ def logline_iter(build_file):
|
|||
with open_file(build_file) as f:
|
||||
while True:
|
||||
line = f.readline()
|
||||
if last_known_timestamp is None and line.startswith(
|
||||
"-- Logs begin at "):
|
||||
continue
|
||||
if line:
|
||||
ts = get_timestamp(line)
|
||||
if ts:
|
||||
last_known_timestamp = ts
|
||||
elif not last_known_timestamp and not ts:
|
||||
ts = datetime.datetime.utcnow().isoformat()
|
||||
ts = datetime.datetime.utcnow()
|
||||
else:
|
||||
ts = last_known_timestamp
|
||||
yield (ts, line)
|
||||
|
@ -277,7 +265,7 @@ def logline_iter(build_file):
|
|||
def doc_iter(inner, index, es_fields, doc_type, chunk_size):
|
||||
for (ts, line) in inner:
|
||||
fields = copy.deepcopy(es_fields)
|
||||
fields["@timestamp"] = ts
|
||||
fields["@timestamp"] = ts.isoformat()
|
||||
|
||||
message = get_message(line)
|
||||
if not message:
|
||||
|
|
|
@ -545,9 +545,9 @@ class TestSender(base.TestCase):
|
|||
|
||||
@mock.patch('logscraper.logsender.logline_iter')
|
||||
def test_doc_iter(self, mock_logline):
|
||||
text = [('2022-02-28T09:39:09.596000',
|
||||
text = [(datetime.datetime(2022, 2, 28, 9, 39, 9, 596000),
|
||||
'2022-02-28 09:39:09.596010 | Job console starting...\n'),
|
||||
('2022-02-28T09:39:09.610000',
|
||||
(datetime.datetime(2022, 2, 28, 9, 39, 9, 610000),
|
||||
'2022-02-28 09:39:09.610160 | Updating repositories\n')]
|
||||
expected_chunk = [{
|
||||
'_index': 'someindex',
|
||||
|
@ -577,11 +577,11 @@ class TestSender(base.TestCase):
|
|||
2022-02-28 09:39:09.996 | Preparing job workspace"""
|
||||
|
||||
expected_data = [
|
||||
('2022-02-28T09:39:09.596',
|
||||
(datetime.datetime(2022, 2, 28, 9, 39, 9, 596000),
|
||||
'2022-02-28 09:39:09.596 | Job console starting...\n'),
|
||||
('2022-02-28T09:39:09.610',
|
||||
(datetime.datetime(2022, 2, 28, 9, 39, 9, 610000),
|
||||
'2022-02-28 09:39:09.610 | Updating repositories\n'),
|
||||
('2022-02-28T09:39:09.996',
|
||||
(datetime.datetime(2022, 2, 28, 9, 39, 9, 996000),
|
||||
'2022-02-28 09:39:09.996 | Preparing job workspace')
|
||||
]
|
||||
readed_data = mock.mock_open(read_data=text)
|
||||
|
@ -608,29 +608,20 @@ class TestSender(base.TestCase):
|
|||
logsender.get_message(line_2))
|
||||
|
||||
def test_get_timestamp(self):
    """get_timestamp should parse every known log timestamp layout."""
    for (line, expected) in [
            # plain date-time with microseconds
            ("2022-02-28 09:44:58.839036 | Other message",
             datetime.datetime(2022, 2, 28, 9, 44, 58, 839036)),
            # isoformat with Z suffix (logstash.log)
            ("2022-03-21T08:39:18.220547Z | Last metadata expiration",
             datetime.datetime(2022, 3, 21, 8, 39, 18, 220547)),
            # syslog format without a year -- current year is assumed
            ("Mar 21 09:33:23 fedora-rax-dfw-0028920567 sudo[2786]: zuul ",
             datetime.datetime(datetime.date.today().year, 3, 21, 9, 33, 23)),
            # isoformat followed by "|"-separated fields (ovs logs);
            # only the second-resolution prefix is parsed
            ("2022-03-23T13:09:08.644Z|00040|connmgr|INFO|br-int: added",
             datetime.datetime(2022, 3, 23, 13, 9, 8)),
            # ansible task banner
            ("Friday 25 February 2022 09:27:51 +0000 (0:00:00.056)",
             datetime.datetime(2022, 2, 25, 9, 27, 51)),
    ]:
        got = logsender.get_timestamp(line)
        self.assertEqual(expected, got)
|
||||
|
||||
@mock.patch('ruamel.yaml.YAML.load')
|
||||
@mock.patch('logscraper.logsender.open_file')
|
||||
|
|
|
@ -5,5 +5,3 @@ PyYAML<6.1 # MIT
|
|||
tenacity
|
||||
opensearch-py<=1.0.0 # Apache-2.0
|
||||
ruamel.yaml
|
||||
datefinder # MIT
|
||||
parsedatetime # Apache-2.0
|
||||
|
|
Loading…
Reference in New Issue