Browse Source

Refactor log parsing into separate parser classes

All log entries are now the lighter weight LogEntry class, which is
also responsible for its own output. Parsing is simplified and
separated into independent parser classes.

Change-Id: I264cf20933e8af007556efd7a36639f854460f49
Matthew Booth 1 year ago
parent
commit
27599915cb
1 changed file with 120 additions and 157 deletions
  1. 120
    157
      oslogmerger/oslogmerger.py

+ 120
- 157
oslogmerger/oslogmerger.py View File

@@ -78,85 +78,119 @@ FILE_MAP = {
78 78
 
79 79
 
80 80
 class LogEntry(object):
81
-    separator = ' '
82
-    date_format = None
83
-    _date_parse_msg = 'unconverted data remains: '
81
+    def __init__(self, alias, dt, data, dt_str=None):
82
+        self.alias = alias
83
+        self.dt = dt
84
+        self.data = data
84 85
 
85
-    def __init__(self, **kwargs):
86
-        self._date_length = None
87
-        self.__dict__.update(**kwargs)
86
+        if dt_str is not None:
87
+            self.dt_str = dt_str
88
+        else:
89
+            self.dt_str = self.dt.strftime('%Y-%m-%d %H:%M:%S.%f')
88 90
 
89
-    @classmethod
90
-    def get_init_args(cls, filename):
91
-        return {}
92
-
93
-    def prepare_line(self, line):
94
-        return line.replace('\0', ' ')
95
-
96
-    def parse_date(self, line):
97
-        try:
98
-            dt = datetime.strptime(line, self.date_format)
99
-        except ValueError as e:
100
-            if not e.args[0].startswith(self._date_parse_msg):
101
-                raise
102
-            prepared_date_length = (len(line) - len(e.args[0]) +
103
-                                    len(self._date_parse_msg))
104
-            dt = datetime.strptime(line[:prepared_date_length],
105
-                                   self.date_format)
106
-            self._date_length = prepared_date_length
107
-        return dt
108
-
109
-    def _calculate_date_length(self):
110
-        return len(self.date.strftime(self.date_format))
111
-
112
-    @property
113
-    def date_length(self):
114
-        if not self._date_length:
115
-            self._date_length = self._calculate_date_length()
116
-        return self._date_length
91
+    def append_line(self, line):
92
+        self.data += EXTRALINES_PADDING + line
117 93
 
118
-    @classmethod
119
-    def factory(cls, filename, line, **kwargs):
120
-        self = cls(**kwargs)
94
+    def __cmp__(self, other):
95
+        return cmp(self.dt, other.dt)
121 96
 
122
-        self.filename = filename
123
-        if not line:
124
-            raise ValueError
97
+    def __str__(self):
98
+        return '%s [%s] %s' % (self.dt_str, self.alias, self.data.rstrip('\n'))
125 99
 
126
-        # Prepare the line for date parsing
127
-        prepared_line = self.prepare_line(line)
128 100
 
129
-        # Extract the datetime
130
-        self.date = self.parse_date(prepared_line)
101
class LogParser(object):
    """Abstract interface for per-file log line parsers.

    A subclass implements parse_line(), returning a (datetime, date
    string, data) triple for a dated line and raising ValueError for a
    line which does not start a new entry.
    """

    def parse_line(self, line):
        raise NotImplementedError
131 104
 
132
-        if (len(line) == self.date_length or
133
-                line[self.date_length] != self.separator):
134
-            raise ValueError
135 105
 
136
-        self.date_str = line[:self.date_length]
106
class StrptimeParser(LogParser):
    """Parse lines whose leading timestamp datetime.strptime can read.

    Subclasses set date_format to the strptime format of the timestamp.
    """

    date_format = None

    def __init__(self, filename):
        # Number of space-separated words the timestamp occupies according
        # to date_format; filename is unused here but part of the parser
        # constructor interface.
        self.date_format_words = len(self.date_format.split(' '))

    def parse_line(self, line):
        """Return (datetime, date string, data) for a dated line.

        Raises ValueError (from strptime) when the line does not start
        with a timestamp matching date_format.
        """
        # Split the input line into words, up to <date_format_words>; the
        # remainder is the data. Split on runs of whitespace (sep=None) so
        # space-padded fields such as syslog's 'Oct  5' still produce one
        # word per format field.
        words = line.split(None, self.date_format_words)
        data = words.pop()
        dt_str = ' '.join(words)

        dt = datetime.strptime(dt_str, self.date_format)

        return dt, dt_str, data
140 124
 
141
-    def append_line(self, line):
142
-        self.data += EXTRALINES_PADDING + line
143 125
 
144
-    def __cmp__(self, other):
145
-        return cmp(self.date, other.date)
126
class OSLogParser(StrptimeParser):
    """Parser for the default OpenStack log timestamp.

    Example: 2016-02-01 10:22:59.239
    """

    date_format = '%Y-%m-%d %H:%M:%S.%f'
146 129
 
147 130
 
148
-class LogFile(object):
149
-    log_entry_class = LogEntry
131
class MsgLogParser(StrptimeParser):
    """Parser for syslog-style messages without a year, e.g. Oct 15 14:11:19."""

    date_format = '%b %d %H:%M:%S'

    def __init__(self, filename):
        super(MsgLogParser, self).__init__(filename)

        # The timestamp carries no year, so borrow it from the file's
        # modification time.
        # TODO: handle the case where log file was closed after a year boundary
        mtime = os.stat(filename).st_mtime
        self.year = datetime.fromtimestamp(mtime).year

    def parse_line(self, line):
        dt, dt_str, data = super(MsgLogParser, self).parse_line(line)
        # strptime defaulted the year to 1900; substitute the file's year.
        return dt.replace(year=self.year), dt_str, data
146
+
147
+
148
class TSLogParser(LogParser):
    """Parser for logs timestamped in seconds since boot: [275514.814982]"""

    def __init__(self, filename):
        stat = os.stat(filename)
        mtime = datetime.fromtimestamp(stat.st_mtime)
        timestamp = self._get_last_timestamp(filename)
        if timestamp is None:
            # Previously this fell through to timedelta(seconds=None) and
            # died with an opaque TypeError; fail with a clear message.
            raise ValueError('%s: no timestamp found in file' % filename)
        # The last entry was written at mtime, so wall-clock zero for the
        # relative timestamps is mtime minus the last timestamp seen.
        self.start_date = mtime - timedelta(seconds=timestamp)

    @classmethod
    def _get_last_timestamp(cls, filename):
        """Return the last parseable timestamp in the file, or None."""
        result = None
        with open(filename, 'r') as f:
            file_size = os.fstat(f.fileno()).st_size
            # We will jump to the last KB so we don't have to read all file
            offset = max(0, file_size - 1024)
            f.seek(offset)
            for line in f:
                try:
                    __, result = cls._read_timestamp(line)
                except ValueError:
                    # Line without a [timestamp]; keep the last good one.
                    continue

            return result

    @staticmethod
    def _read_timestamp(line):
        """Return (index of ']', timestamp as float) for a [ts] line.

        Raises ValueError when the line holds no well-formed [timestamp].
        """
        start = line.index('[') + 1
        end = line.index(']')

        if end < start:
            raise ValueError

        return end, float(line[start:end])

    def parse_line(self, line):
        """Return (datetime, '[ts]' prefix, remainder) for a dated line."""
        end, timestamp = self._read_timestamp(line)
        dt = self.start_date + timedelta(seconds=timestamp)
        return dt, line[:end + 1], line[end + 1:]
187
+
188
+
189
+class LogFile(object):
190
+    def __init__(self, filename, alias, parser_cls):
159 191
         self.open(filename)
192
+        self.alias = alias
193
+        self.parser = parser_cls(filename)
160 194
 
161 195
     def open(self, filename):
162 196
         self._filename = filename
@@ -164,8 +198,6 @@ class LogFile(object):
164 198
             filename = self._cached_download(filename)
165 199
 
166 200
         self._file = open(filename, 'r')
167
-        stat = os.stat(filename)
168
-        self.mtime = datetime.fromtimestamp(stat.st_mtime)
169 201
 
170 202
     def _url_cache_path(self, url):
171 203
         md5 = hashlib.md5()
@@ -207,18 +239,16 @@ class LogFile(object):
207 239
             line = self._file.readline()
208 240
             if line == "":
209 241
                 return entry, None
242
+            line.replace('\0', ' ')
210 243
 
211 244
             try:
212
-                new_entry = self.log_entry_class.factory(self._filename,
213
-                                                         line,
214
-                                                         **self.entry_kwargs)
215
-                if new_entry is None:
216
-                    continue
245
+                dt, dt_str, data = self.parser.parse_line(line)
246
+                new_entry = LogEntry(self.alias, dt, data, dt_str=dt_str)
217 247
                 if entry:
218 248
                     return entry, new_entry
219 249
                 entry = new_entry
220 250
 
221
-            except Exception:
251
+            except ValueError:
222 252
                 # it's probably a non-dated line, or a garbled entry, just
223 253
                 # append to the entry extra info
224 254
                 if entry:
@@ -247,104 +277,37 @@ class LogFile(object):
247 277
         return cmp(self.peek(), other.peek())
248 278
 
249 279
 
250
-class MsgLogEntry(LogEntry):
251
-    """Message format: Oct 15 14:11:19"""
252
-    date_format = '%Y%b %d %H:%M:%S'
253
-
254
-    @classmethod
255
-    def get_init_args(cls, filename):
256
-        kwargs = super(MsgLogEntry, cls).get_init_args(filename)
257
-        stat = os.stat(filename)
258
-        kwargs['file_year'] = datetime.fromtimestamp(stat.st_mtime).year
259
-        return kwargs
260
-
261
-    def prepare_line(self, line):
262
-        # TODO: If year of file creation and file last modification are
263
-        # different we should start with the cration year and then change to
264
-        # the next year once the months go back.
265
-        line = super(MsgLogEntry, self).prepare_line(line)
266
-        return '%s%s' % (self.file_year, line)
267
-
268
-    def _calculate_date_length(self):
269
-        return super(MsgLogEntry, self)._calculate_date_length() - 4
270
-
271
-
272
-class OSLogEntry(LogEntry):
273
-    """OpenStack default log: 2016-02-01 10:22:59.239"""
274
-    date_format = '%Y-%m-%d %H:%M:%S.%f'
275
-
276
-    def _calculate_date_length(self):
277
-        return super(OSLogEntry, self)._calculate_date_length() - 3
278
-
279
-
280
-class TSLogEntry(LogEntry):
281
-    """Timestamped log: [275514.814982]"""
282
-
283
-    @classmethod
284
-    def get_init_args(cls, filename):
285
-        kwargs = super(TSLogEntry, cls).get_init_args(filename)
286
-        stat = os.stat(filename)
287
-        mtime = datetime.fromtimestamp(stat.st_mtime)
288
-        timestamp = cls._get_last_timestamp(filename)
289
-        kwargs['start_date'] = mtime - timedelta(seconds=timestamp)
290
-        return kwargs
291
-
292
-    @classmethod
293
-    def _get_last_timestamp(cls, filename):
294
-        result = None
295
-        with open(filename, 'r') as f:
296
-            file_size = os.fstat(f.fileno()).st_size
297
-            # We will jump to the last KB so we don't have to read all file
298
-            offset = max(0, file_size - 1024)
299
-            f.seek(offset)
300
-            for line in f:
301
-                try:
302
-                    __, result = cls._read_timestamp(line)
303
-                except ValueError:
304
-                    continue
305
-
306
-            return result
307
-
308
-    @staticmethod
309
-    def _read_timestamp(line):
310
-        start = line.index('[') + 1
311
-        end = line.index(']')
312
-
313
-        if end < start:
314
-            raise ValueError
315
-
316
-        return end, float(line[start:end])
317
-
318
-    def parse_date(self, date_str):
319
-        end, timestamp = self._read_timestamp(date_str)
320
-        self._date_length = end + 1
321
-        return self.start_date + timedelta(seconds=timestamp)
322
-
323
-
324
-LOG_TYPES = [
325
-    ('logfiles', OSLogEntry),
326
-    ('logfiles_m', MsgLogEntry),
327
-    ('logfiles_t', TSLogEntry),
328
-]
280
# Maps the config attribute holding each list of input files to the parser
# class that understands that file type's timestamp format.
LOG_TYPES = {
    'logfiles': OSLogParser,
    'logfiles_m': MsgLogParser,
    'logfiles_t': TSLogParser,
}
329 285
 
330 286
 
331 287
 def process_logs(cfg):
332 288
     filename_alias = {}
333 289
     logs = []
334
-    for arg_name, entry_cls in LOG_TYPES:
290
+
291
+    paths_aliases = {}
292
+    paths_parsers = {}
293
+    for arg_name, parser_cls in LOG_TYPES.items():
335 294
         for filename in getattr(cfg, arg_name):
336
-            path, alias, is_url = get_path_and_alias(filename,
337
-                                                     cfg.log_base,
295
+            path, alias, is_url = get_path_and_alias(filename, cfg.log_base,
338 296
                                                      cfg.log_postfix)
339
-            filename_alias[path] = (filename, alias, is_url)
340
-            logs.append(LogFile.factory(entry_cls, path))
297
+            paths_aliases[path] = (filename, alias, is_url)
298
+            paths_parsers[path] = parser_cls
299
+
300
+    # NOTE(mdbooth): I feel like generate_aliases should take a single path,
301
+    # which would make this loop much tidier. I don't want to unpick it right
302
+    # now, though.
303
+    aliases = generate_aliases(paths_aliases, cfg)
341 304
 
342
-    alias = generate_aliases(filename_alias, cfg)
305
+    logs = [LogFile(path, aliases[path], parser_cls)
306
+            for path, parser_cls in paths_parsers.items()]
343 307
 
344 308
     entry_iters = [iter(log) for log in logs]
345 309
     for entry in heapq.merge(*entry_iters):
346
-        print('%s [%s] %s' % (entry.date_str, alias[entry.filename],
347
-              entry.data.rstrip('\n')))
310
+        print(entry)
348 311
 
349 312
 
350 313
 def get_path_and_alias(filename, log_base, log_postfix):

Loading…
Cancel
Save