Merge "Refactor log parsing into separate parser classes"
This commit is contained in:
commit
e2a4066918
|
@ -78,85 +78,119 @@ FILE_MAP = {
|
||||||
|
|
||||||
|
|
||||||
class LogEntry(object):
|
class LogEntry(object):
|
||||||
separator = ' '
|
def __init__(self, alias, dt, data, dt_str=None):
|
||||||
date_format = None
|
self.alias = alias
|
||||||
_date_parse_msg = 'unconverted data remains: '
|
self.dt = dt
|
||||||
|
self.data = data
|
||||||
|
|
||||||
def __init__(self, **kwargs):
|
if dt_str is not None:
|
||||||
self._date_length = None
|
self.dt_str = dt_str
|
||||||
self.__dict__.update(**kwargs)
|
else:
|
||||||
|
self.dt_str = self.dt.strftime('%Y-%m-%d %H:%M:%S.%f')
|
||||||
@classmethod
|
|
||||||
def get_init_args(cls, filename):
|
|
||||||
return {}
|
|
||||||
|
|
||||||
def prepare_line(self, line):
|
|
||||||
return line.replace('\0', ' ')
|
|
||||||
|
|
||||||
def parse_date(self, line):
|
|
||||||
try:
|
|
||||||
dt = datetime.strptime(line, self.date_format)
|
|
||||||
except ValueError as e:
|
|
||||||
if not e.args[0].startswith(self._date_parse_msg):
|
|
||||||
raise
|
|
||||||
prepared_date_length = (len(line) - len(e.args[0]) +
|
|
||||||
len(self._date_parse_msg))
|
|
||||||
dt = datetime.strptime(line[:prepared_date_length],
|
|
||||||
self.date_format)
|
|
||||||
self._date_length = prepared_date_length
|
|
||||||
return dt
|
|
||||||
|
|
||||||
def _calculate_date_length(self):
|
|
||||||
return len(self.date.strftime(self.date_format))
|
|
||||||
|
|
||||||
@property
|
|
||||||
def date_length(self):
|
|
||||||
if not self._date_length:
|
|
||||||
self._date_length = self._calculate_date_length()
|
|
||||||
return self._date_length
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def factory(cls, filename, line, **kwargs):
|
|
||||||
self = cls(**kwargs)
|
|
||||||
|
|
||||||
self.filename = filename
|
|
||||||
if not line:
|
|
||||||
raise ValueError
|
|
||||||
|
|
||||||
# Prepare the line for date parsing
|
|
||||||
prepared_line = self.prepare_line(line)
|
|
||||||
|
|
||||||
# Extract the datetime
|
|
||||||
self.date = self.parse_date(prepared_line)
|
|
||||||
|
|
||||||
if (len(line) == self.date_length or
|
|
||||||
line[self.date_length] != self.separator):
|
|
||||||
raise ValueError
|
|
||||||
|
|
||||||
self.date_str = line[:self.date_length]
|
|
||||||
# +1 to remove the separator so we don't have 2 spaces on output
|
|
||||||
self.data = line[self.date_length + 1:]
|
|
||||||
return self
|
|
||||||
|
|
||||||
def append_line(self, line):
|
def append_line(self, line):
|
||||||
self.data += EXTRALINES_PADDING + line
|
self.data += EXTRALINES_PADDING + line
|
||||||
|
|
||||||
def __cmp__(self, other):
|
def __cmp__(self, other):
|
||||||
return cmp(self.date, other.date)
|
return cmp(self.dt, other.dt)
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
return '%s [%s] %s' % (self.dt_str, self.alias, self.data.rstrip('\n'))
|
||||||
|
|
||||||
|
|
||||||
|
class LogParser(object):
    """Common interface of the log line parsers.

    A concrete parser overrides parse_line(); the parsers in this file
    return a ``(dt, dt_str, data)`` tuple from it.
    """

    def parse_line(self, line):
        """Parse a single log line.

        :param line: one raw line read from a log file.
        :raises NotImplementedError: always, the base class cannot parse.
        """
        raise NotImplementedError()
|
||||||
|
|
||||||
|
|
||||||
|
class StrptimeParser(LogParser):
    """Parse log lines that begin with a date understood by strptime().

    Subclasses provide ``date_format``. The date is taken to span as many
    space separated words of the line as the format itself has.
    """

    # strptime() format of the date that starts every entry; subclasses must
    # override it.
    date_format = None

    def __init__(self, filename):
        """Work out how many words of each line belong to the date.

        :param filename: path of the log being parsed. It is not used here;
                         it is accepted because parsers are built as
                         ``parser_cls(filename)``.
        """
        self.date_format_words = len(self.date_format.split(' '))

    def parse_line(self, line):
        """Split a log line into its date and the data that follows it.

        :param line: one raw line read from the log file.
        :returns: ``(dt, dt_str, data)`` where ``dt`` is the parsed datetime,
                  ``dt_str`` is the date exactly as written in the line and
                  ``data`` is everything after the space that follows it.
        :raises ValueError: if the line does not start with a date matching
                            ``date_format`` followed by a space.
        """
        fields = line.split(' ')

        # Walk the fields until <date_format_words> real words were seen.
        # Empty fields are produced by runs of spaces, like the padding used
        # for single digit days ("Oct  5 14:11:19"), and must not be counted
        # as words or the date would be cut short.
        words_seen = 0
        for last, field in enumerate(fields):
            if field:
                words_seen += 1
                if words_seen == self.date_format_words:
                    break
        else:
            raise ValueError('line does not start with a date: %r' % line)

        data_fields = fields[last + 1:]
        if not data_fields:
            # Nothing follows the date words, not even the separator.
            raise ValueError('line has no data after the date: %r' % line)

        # Joining with the same separator used to split gives back the
        # original text, padding included.
        dt_str = ' '.join(fields[:last + 1])
        dt = datetime.strptime(dt_str, self.date_format)

        return dt, dt_str, ' '.join(data_fields)
|
||||||
|
|
||||||
|
|
||||||
|
class OSLogParser(StrptimeParser):
    """Parser for the OpenStack default log format.

    Entries start with a date such as ``2016-02-01 10:22:59.239``.
    """

    # Two words: the calendar date, then the time with a fractional part.
    date_format = '%Y-%m-%d %H:%M:%S.%f'
|
||||||
|
|
||||||
|
|
||||||
|
class MsgLogParser(StrptimeParser):
    """Message format: Oct 15 14:11:19

    The dates in this format carry no year, so the year the file was last
    modified in is used for every entry.
    """

    date_format = '%b %d %H:%M:%S'

    def __init__(self, filename):
        """Record the year to assign to the entries of this file.

        :param filename: path of the log file; its modification time gives
                         the year.
        """
        super(MsgLogParser, self).__init__(filename)
        stat = os.stat(filename)

        # TODO: handle the case where log file was closed after a year boundary
        log_modified = datetime.fromtimestamp(stat.st_mtime)
        self.year = log_modified.year

    def parse_line(self, line):
        """Parse a line and place its date in the year of the log file.

        :param line: one raw line read from the log file.
        :returns: ``(dt, dt_str, data)`` as the parent class does, with the
                  year of ``dt`` set to ``self.year``.
        :raises ValueError: if the line does not start with a valid date.
        """
        import calendar

        try:
            dt, dt_str, data = super(MsgLogParser, self).parse_line(line)
        except ValueError:
            # Without a year strptime() assumes 1900, which is not a leap
            # year, so "Feb 29" is rejected even when the log was written in
            # a leap year. Only in that case is a second attempt worthwhile.
            if not calendar.isleap(self.year):
                raise
            words = line.split(' ', self.date_format_words)
            data = words.pop()
            dt_str = ' '.join(words)
            # Parse the date together with the year so Feb 29 is accepted.
            dt = datetime.strptime('%04d %s' % (self.year, dt_str),
                                   '%Y ' + self.date_format)
            return dt, dt_str, data

        return dt.replace(year=self.year), dt_str, data
|
||||||
|
|
||||||
|
|
||||||
|
class TSLogParser(LogParser):
    """Timestamped log: [275514.814982]

    Timestamps are seconds relative to an unknown start. The start date is
    derived by subtracting the last timestamp of the file from the file's
    modification time.
    """

    def __init__(self, filename):
        """Compute the date that corresponds to timestamp 0.

        :param filename: path of the log file.
        """
        stat = os.stat(filename)
        mtime = datetime.fromtimestamp(stat.st_mtime)
        timestamp = self._get_last_timestamp(filename)
        if timestamp is None:
            # The file has no timestamped line, so no line will ever be
            # parsed and the start date is irrelevant. Any value works, but
            # timedelta() would raise TypeError on None.
            timestamp = 0.0
        self.start_date = mtime - timedelta(seconds=timestamp)

    @classmethod
    def _get_last_timestamp(cls, filename):
        """Return the last timestamp found in the file.

        :param filename: path of the log file.
        :returns: the timestamp as a float, or None if no line has one.
        """
        with open(filename, 'r') as f:
            file_size = os.fstat(f.fileno()).st_size
            # We will jump to the last KB so we don't have to read all file
            offset = max(0, file_size - 1024)
            f.seek(offset)
            result = cls._last_timestamp_in(f)
            if result is None and offset:
                # Nothing in the tail (e.g. a very long final line); scan
                # the whole file rather than giving up.
                f.seek(0)
                result = cls._last_timestamp_in(f)
        return result

    @classmethod
    def _last_timestamp_in(cls, lines):
        """Return the timestamp of the last parseable line, or None."""
        result = None
        for line in lines:
            try:
                __, result = cls._read_timestamp(line)
            except ValueError:
                # Line without a timestamp, e.g. the partial first line
                # read after seeking into the middle of the file.
                continue
        return result

    @staticmethod
    def _read_timestamp(line):
        """Extract the bracketed timestamp of a line.

        :param line: one raw line read from the log file.
        :returns: ``(end, timestamp)`` where ``end`` is the index of the
                  first ``]`` and ``timestamp`` the float between the first
                  ``[`` and that ``]``.
        :raises ValueError: if a bracket is missing, ``]`` comes before
                            ``[`` or the enclosed text is not a number.
        """
        start = line.index('[') + 1
        end = line.index(']')

        if end < start:
            raise ValueError

        return end, float(line[start:end])

    def parse_line(self, line):
        """Split a log line into its timestamp and the data after it.

        :param line: one raw line read from the log file.
        :returns: ``(dt, dt_str, data)`` where ``dt`` is the start date plus
                  the timestamp, ``dt_str`` is the line up to and including
                  ``]`` and ``data`` is the remainder of the line.
        :raises ValueError: if the line has no valid timestamp.
        """
        end, timestamp = self._read_timestamp(line)
        dt = self.start_date + timedelta(seconds=timestamp)
        data = line[end + 1:]
        # Drop the single space that separates the timestamp from the
        # message, as the strptime based parsers do, so the output does not
        # end up with two consecutive spaces.
        if data.startswith(' '):
            data = data[1:]
        return dt, line[:end + 1], data
|
||||||
|
|
||||||
|
|
||||||
class LogFile(object):
|
class LogFile(object):
|
||||||
log_entry_class = LogEntry
|
def __init__(self, filename, alias, parser_cls):
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def factory(cls, filename):
|
|
||||||
instance = LogFile(filename)
|
|
||||||
instance.log_entry_class = cls
|
|
||||||
instance.entry_kwargs = cls.get_init_args(filename)
|
|
||||||
return instance
|
|
||||||
|
|
||||||
def __init__(self, filename):
|
|
||||||
self.open(filename)
|
self.open(filename)
|
||||||
|
self.alias = alias
|
||||||
|
self.parser = parser_cls(filename)
|
||||||
|
|
||||||
def open(self, filename):
|
def open(self, filename):
|
||||||
self._filename = filename
|
self._filename = filename
|
||||||
|
@ -164,8 +198,6 @@ class LogFile(object):
|
||||||
filename = self._cached_download(filename)
|
filename = self._cached_download(filename)
|
||||||
|
|
||||||
self._file = open(filename, 'r')
|
self._file = open(filename, 'r')
|
||||||
stat = os.stat(filename)
|
|
||||||
self.mtime = datetime.fromtimestamp(stat.st_mtime)
|
|
||||||
|
|
||||||
def _url_cache_path(self, url):
|
def _url_cache_path(self, url):
|
||||||
md5 = hashlib.md5()
|
md5 = hashlib.md5()
|
||||||
|
@ -207,18 +239,16 @@ class LogFile(object):
|
||||||
line = self._file.readline()
|
line = self._file.readline()
|
||||||
if line == "":
|
if line == "":
|
||||||
return entry, None
|
return entry, None
|
||||||
|
line.replace('\0', ' ')
|
||||||
|
|
||||||
try:
|
try:
|
||||||
new_entry = self.log_entry_class.factory(self._filename,
|
dt, dt_str, data = self.parser.parse_line(line)
|
||||||
line,
|
new_entry = LogEntry(self.alias, dt, data, dt_str=dt_str)
|
||||||
**self.entry_kwargs)
|
|
||||||
if new_entry is None:
|
|
||||||
continue
|
|
||||||
if entry:
|
if entry:
|
||||||
return entry, new_entry
|
return entry, new_entry
|
||||||
entry = new_entry
|
entry = new_entry
|
||||||
|
|
||||||
except Exception:
|
except ValueError:
|
||||||
# it's probably a non-dated line, or a garbled entry, just
|
# it's probably a non-dated line, or a garbled entry, just
|
||||||
# append to the entry extra info
|
# append to the entry extra info
|
||||||
if entry:
|
if entry:
|
||||||
|
@ -247,104 +277,37 @@ class LogFile(object):
|
||||||
return cmp(self.peek(), other.peek())
|
return cmp(self.peek(), other.peek())
|
||||||
|
|
||||||
|
|
||||||
class MsgLogEntry(LogEntry):
|
LOG_TYPES = {
|
||||||
"""Message format: Oct 15 14:11:19"""
|
'logfiles': OSLogParser,
|
||||||
date_format = '%Y%b %d %H:%M:%S'
|
'logfiles_m': MsgLogParser,
|
||||||
|
'logfiles_t': TSLogParser,
|
||||||
@classmethod
|
}
|
||||||
def get_init_args(cls, filename):
|
|
||||||
kwargs = super(MsgLogEntry, cls).get_init_args(filename)
|
|
||||||
stat = os.stat(filename)
|
|
||||||
kwargs['file_year'] = datetime.fromtimestamp(stat.st_mtime).year
|
|
||||||
return kwargs
|
|
||||||
|
|
||||||
def prepare_line(self, line):
|
|
||||||
# TODO: If year of file creation and file last modification are
|
|
||||||
# different we should start with the creation year and then change to
|
|
||||||
# the next year once the months go back.
|
|
||||||
line = super(MsgLogEntry, self).prepare_line(line)
|
|
||||||
return '%s%s' % (self.file_year, line)
|
|
||||||
|
|
||||||
def _calculate_date_length(self):
|
|
||||||
return super(MsgLogEntry, self)._calculate_date_length() - 4
|
|
||||||
|
|
||||||
|
|
||||||
class OSLogEntry(LogEntry):
    """Entry of an OpenStack default log.

    Dates look like ``2016-02-01 10:22:59.239``.
    """

    date_format = '%Y-%m-%d %H:%M:%S.%f'

    def _calculate_date_length(self):
        """Return the length of the date as it is written in the log.

        The length computed by the parent class is reduced by 3;
        presumably because the log writes 3 fractional digits (see the
        example date above) while strftime's %f renders 6.
        """
        rendered_length = super(OSLogEntry, self)._calculate_date_length()
        return rendered_length - 3
|
|
||||||
|
|
||||||
|
|
||||||
class TSLogEntry(LogEntry):
    """Timestamped log: [275514.814982]

    Timestamps are seconds relative to an unknown start. The start date is
    derived by subtracting the last timestamp of the file from the file's
    modification time.
    """

    @classmethod
    def get_init_args(cls, filename):
        """Return the constructor kwargs for the entries of a file.

        :param filename: path of the log file.
        :returns: the parent's kwargs plus ``start_date``, the date that
                  corresponds to timestamp 0.
        """
        kwargs = super(TSLogEntry, cls).get_init_args(filename)
        stat = os.stat(filename)
        mtime = datetime.fromtimestamp(stat.st_mtime)
        timestamp = cls._get_last_timestamp(filename)
        if timestamp is None:
            # The file has no timestamped line, so no entry will ever be
            # created from it; avoid the TypeError timedelta() raises on
            # None.
            timestamp = 0.0
        kwargs['start_date'] = mtime - timedelta(seconds=timestamp)
        return kwargs

    @classmethod
    def _get_last_timestamp(cls, filename):
        """Return the last timestamp found in the file.

        :param filename: path of the log file.
        :returns: the timestamp as a float, or None if no line has one.
        """
        with open(filename, 'r') as f:
            file_size = os.fstat(f.fileno()).st_size
            # We will jump to the last KB so we don't have to read all file
            offset = max(0, file_size - 1024)
            f.seek(offset)
            result = cls._last_timestamp_in(f)
            if result is None and offset:
                # Nothing in the tail (e.g. a very long final line); scan
                # the whole file rather than giving up.
                f.seek(0)
                result = cls._last_timestamp_in(f)
        return result

    @classmethod
    def _last_timestamp_in(cls, lines):
        """Return the timestamp of the last parseable line, or None."""
        result = None
        for line in lines:
            try:
                __, result = cls._read_timestamp(line)
            except ValueError:
                # Line without a timestamp, e.g. the partial first line
                # read after seeking into the middle of the file.
                continue
        return result

    @staticmethod
    def _read_timestamp(line):
        """Extract the bracketed timestamp of a line.

        :param line: one raw line read from the log file.
        :returns: ``(end, timestamp)`` where ``end`` is the index of the
                  first ``]`` and ``timestamp`` the float between the first
                  ``[`` and that ``]``.
        :raises ValueError: if a bracket is missing, ``]`` comes before
                            ``[`` or the enclosed text is not a number.
        """
        start = line.index('[') + 1
        end = line.index(']')

        if end < start:
            raise ValueError

        return end, float(line[start:end])

    def parse_date(self, date_str):
        """Return the date of a line and remember how long its date is.

        :param date_str: the prepared log line.
        :returns: the start date plus the timestamp of the line.
        :raises ValueError: if the line has no valid timestamp.
        """
        end, timestamp = self._read_timestamp(date_str)
        # The date text runs up to and including the closing bracket.
        self._date_length = end + 1
        return self.start_date + timedelta(seconds=timestamp)
|
|
||||||
|
|
||||||
|
|
||||||
# Pairs the name of each cfg attribute that lists log files with the entry
# class used to parse the files listed there.
LOG_TYPES = [
    ('logfiles', OSLogEntry),
    ('logfiles_m', MsgLogEntry),
    ('logfiles_t', TSLogEntry),
]
|
|
||||||
|
|
||||||
|
|
||||||
def process_logs(cfg):
|
def process_logs(cfg):
|
||||||
filename_alias = {}
|
filename_alias = {}
|
||||||
logs = []
|
logs = []
|
||||||
for arg_name, entry_cls in LOG_TYPES:
|
|
||||||
for filename in getattr(cfg, arg_name):
|
|
||||||
path, alias, is_url = get_path_and_alias(filename,
|
|
||||||
cfg.log_base,
|
|
||||||
cfg.log_postfix)
|
|
||||||
filename_alias[path] = (filename, alias, is_url)
|
|
||||||
logs.append(LogFile.factory(entry_cls, path))
|
|
||||||
|
|
||||||
alias = generate_aliases(filename_alias, cfg)
|
paths_aliases = {}
|
||||||
|
paths_parsers = {}
|
||||||
|
for arg_name, parser_cls in LOG_TYPES.items():
|
||||||
|
for filename in getattr(cfg, arg_name):
|
||||||
|
path, alias, is_url = get_path_and_alias(filename, cfg.log_base,
|
||||||
|
cfg.log_postfix)
|
||||||
|
paths_aliases[path] = (filename, alias, is_url)
|
||||||
|
paths_parsers[path] = parser_cls
|
||||||
|
|
||||||
|
# NOTE(mdbooth): I feel like generate_aliases should take a single path,
|
||||||
|
# which would make this loop much tidier. I don't want to unpick it right
|
||||||
|
# now, though.
|
||||||
|
aliases = generate_aliases(paths_aliases, cfg)
|
||||||
|
|
||||||
|
logs = [LogFile(path, aliases[path], parser_cls)
|
||||||
|
for path, parser_cls in paths_parsers.items()]
|
||||||
|
|
||||||
entry_iters = [iter(log) for log in logs]
|
entry_iters = [iter(log) for log in logs]
|
||||||
for entry in heapq.merge(*entry_iters):
|
for entry in heapq.merge(*entry_iters):
|
||||||
print('%s [%s] %s' % (entry.date_str, alias[entry.filename],
|
print(entry)
|
||||||
entry.data.rstrip('\n')))
|
|
||||||
|
|
||||||
|
|
||||||
def get_path_and_alias(filename, log_base, log_postfix):
|
def get_path_and_alias(filename, log_base, log_postfix):
|
||||||
|
|
Loading…
Reference in New Issue