From 54eb1a0785006fe585316e37627637658dc9b5a8 Mon Sep 17 00:00:00 2001 From: Clark Boylan Date: Wed, 22 Nov 2017 14:38:11 -0800 Subject: [PATCH] Collapse logically identical filenames for crm114 Log files come with many names while still containing the same logical content. That may be because the path to them differs (e.g. /var/log/foo.log and /opt/stack/log/foo.log), because of file rotation (e.g. /var/log/foo.log and /var/log/foo.log.1), or because of compression (e.g. /var/log/foo.log and /var/log/foo.log.gz). At the end of the day these are all the same foo.log log file. This means when we do machine learning on the log files we can collapse all these different cases down into a single case that we learn on. This has become more important since we recently ran out of disk space due to all the non-unique log paths out there for our log files, but it should also result in better learning. Change-Id: I4ba276870b73640909ac469b336a436eb127f611 --- files/log-gearman-worker.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/files/log-gearman-worker.py b/files/log-gearman-worker.py index d800079..491b697 100644 --- a/files/log-gearman-worker.py +++ b/files/log-gearman-worker.py @@ -116,9 +116,19 @@ class CRM114FilterFactory(object): def __init__(self, script, basepath): self.script = script self.basepath = basepath + # Precompile regexes + self.re_remove_suffix = re.compile(r'(\.[^a-zA-Z]+)?(\.gz)?$') + self.re_remove_dot = re.compile(r'\.') def create(self, fields): - filename = re.sub('\.', '_', fields['filename']) + # We only want the basename so that the same logfile at different + # paths isn't treated as different + filename = os.path.basename(fields['filename']) + # We want to collapse any numeric or compression suffixes so that + # nova.log and nova.log.1 and nova.log.1.gz are treated as the same + # logical file + filename = self.re_remove_suffix.sub(r'', filename) + filename = self.re_remove_dot.sub('_', filename) path = 
os.path.join(self.basepath, filename) return CRM114Filter(self.script, path, fields['build_status'])