
Merge "Refactor log parsing into separate parser classes"

Zuul 1 year ago
parent commit e2a4066918
1 changed file with 120 additions and 157 deletions
oslogmerger/oslogmerger.py  +120 -157

@@ -78,85 +78,119 @@ FILE_MAP = {
 
 
 class LogEntry(object):
-    separator = ' '
-    date_format = None
-    _date_parse_msg = 'unconverted data remains: '
+    def __init__(self, alias, dt, data, dt_str=None):
+        self.alias = alias
+        self.dt = dt
+        self.data = data
 
-    def __init__(self, **kwargs):
-        self._date_length = None
-        self.__dict__.update(**kwargs)
+        if dt_str is not None:
+            self.dt_str = dt_str
+        else:
+            self.dt_str = self.dt.strftime('%Y-%m-%d %H:%M:%S.%f')
 
-    @classmethod
-    def get_init_args(cls, filename):
-        return {}
-
-    def prepare_line(self, line):
-        return line.replace('\0', ' ')
-
-    def parse_date(self, line):
-        try:
-            dt = datetime.strptime(line, self.date_format)
-        except ValueError as e:
-            if not e.args[0].startswith(self._date_parse_msg):
-                raise
-            prepared_date_length = (len(line) - len(e.args[0]) +
-                                    len(self._date_parse_msg))
-            dt = datetime.strptime(line[:prepared_date_length],
-                                   self.date_format)
-            self._date_length = prepared_date_length
-        return dt
-
-    def _calculate_date_length(self):
-        return len(self.date.strftime(self.date_format))
-
-    @property
-    def date_length(self):
-        if not self._date_length:
-            self._date_length = self._calculate_date_length()
-        return self._date_length
+    def append_line(self, line):
+        self.data += EXTRALINES_PADDING + line
 
-    @classmethod
-    def factory(cls, filename, line, **kwargs):
-        self = cls(**kwargs)
+    def __cmp__(self, other):
+        return cmp(self.dt, other.dt)
 
-        self.filename = filename
-        if not line:
-            raise ValueError
+    def __str__(self):
+        return '%s [%s] %s' % (self.dt_str, self.alias, self.data.rstrip('\n'))
 
-        # Prepare the line for date parsing
-        prepared_line = self.prepare_line(line)
 
-        # Extract the datetime
-        self.date = self.parse_date(prepared_line)
+class LogParser(object):
+    def parse_line(self, line):
+        raise NotImplementedError
 
-        if (len(line) == self.date_length or
-                line[self.date_length] != self.separator):
-            raise ValueError
 
-        self.date_str = line[:self.date_length]
+class StrptimeParser(LogParser):
+    date_format = None
+
+    def __init__(self, filename):
+        self.date_format_words = len(self.date_format.split(' '))
+
+    def parse_line(self, line):
+        # Split the input line into words, up to <date_format_words>. Data is
+        # anything after that. Join the first <date_format_words> words to
+        # recreate the date.
+        dt_str = line.split(' ', self.date_format_words)
+        data = dt_str.pop()
+        dt_str = ' '.join(dt_str)
+
+        dt = datetime.strptime(dt_str, self.date_format)
+
         # +1 to remove the separator so we don't have 2 spaces on output
-        self.data = line[self.date_length + 1:]
-        return self
+        return dt, dt_str, data
 
-    def append_line(self, line):
-        self.data += EXTRALINES_PADDING + line
 
-    def __cmp__(self, other):
-        return cmp(self.date, other.date)
+class OSLogParser(StrptimeParser):
+    """OpenStack default log: 2016-02-01 10:22:59.239"""
+    date_format = '%Y-%m-%d %H:%M:%S.%f'
 
 
-class LogFile(object):
-    log_entry_class = LogEntry
+class MsgLogParser(StrptimeParser):
+    """Message format: Oct 15 14:11:19"""
+    date_format = '%b %d %H:%M:%S'
 
-    @staticmethod
-    def factory(cls, filename):
-        instance = LogFile(filename)
-        instance.log_entry_class = cls
-        instance.entry_kwargs = cls.get_init_args(filename)
-        return instance
+    def __init__(self, filename):
+        super(MsgLogParser, self).__init__(filename)
+        stat = os.stat(filename)
+
+        # TODO: handle the case where log file was closed after a year boundary
+        log_modified = datetime.fromtimestamp(stat.st_mtime)
+        self.year = log_modified.year
+
+    def parse_line(self, line):
+        dt, dt_str, data = super(MsgLogParser, self).parse_line(line)
+        return dt.replace(self.year), dt_str, data
+
+
+class TSLogParser(LogParser):
+    """Timestamped log: [275514.814982]"""
 
     def __init__(self, filename):
+        stat = os.stat(filename)
+        mtime = datetime.fromtimestamp(stat.st_mtime)
+        timestamp = self._get_last_timestamp(filename)
+        self.start_date = mtime - timedelta(seconds=timestamp)
+
+    @classmethod
+    def _get_last_timestamp(cls, filename):
+        result = None
+        with open(filename, 'r') as f:
+            file_size = os.fstat(f.fileno()).st_size
+            # We will jump to the last KB so we don't have to read all file
+            offset = max(0, file_size - 1024)
+            f.seek(offset)
+            for line in f:
+                try:
+                    __, result = cls._read_timestamp(line)
+                except ValueError:
+                    continue
+
+            return result
+
+    @staticmethod
+    def _read_timestamp(line):
+        start = line.index('[') + 1
+        end = line.index(']')
+
+        if end < start:
+            raise ValueError
+
+        return end, float(line[start:end])
+
+    def parse_line(self, line):
+        end, timestamp = self._read_timestamp(line)
+        dt = self.start_date + timedelta(seconds=timestamp)
+        return dt, line[:end + 1], line[end + 1:]
+
+
+class LogFile(object):
+    def __init__(self, filename, alias, parser_cls):
         self.open(filename)
+        self.alias = alias
+        self.parser = parser_cls(filename)
 
     def open(self, filename):
         self._filename = filename
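To see how the new split between parsing and entries fits together, here is a minimal sketch using OSLogParser and LogEntry as introduced above; the sample log line and the 'NOVA' alias are invented for illustration, and it assumes the module's names are in scope:

    # Sketch only: names come from the diff above; the input line is made up.
    parser = OSLogParser('nova-api.log')  # StrptimeParser ignores the filename
    line = '2016-02-01 10:22:59.239 INFO nova.api [-] GET /servers\n'
    # date_format has two space-separated words, so split(' ', 2) peels the
    # date off the front and leaves the remainder as data
    dt, dt_str, data = parser.parse_line(line)
    entry = LogEntry('NOVA', dt, data, dt_str=dt_str)
    print(entry)  # 2016-02-01 10:22:59.239 [NOVA] INFO nova.api [-] GET /servers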
@@ -164,8 +198,6 @@ class LogFile(object):
             filename = self._cached_download(filename)
 
         self._file = open(filename, 'r')
-        stat = os.stat(filename)
-        self.mtime = datetime.fromtimestamp(stat.st_mtime)
 
     def _url_cache_path(self, url):
         md5 = hashlib.md5()
@@ -207,18 +239,16 @@ class LogFile(object):
             line = self._file.readline()
             if line == "":
                 return entry, None
+            line = line.replace('\0', ' ')
 
             try:
-                new_entry = self.log_entry_class.factory(self._filename,
-                                                         line,
-                                                         **self.entry_kwargs)
-                if new_entry is None:
-                    continue
+                dt, dt_str, data = self.parser.parse_line(line)
+                new_entry = LogEntry(self.alias, dt, data, dt_str=dt_str)
                 if entry:
                     return entry, new_entry
                 entry = new_entry
 
-            except Exception:
+            except ValueError:
                 # it's probably a non-dated line, or a garbled entry, just
                 # append to the entry extra info
                 if entry:
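Narrowing except Exception to except ValueError preserves the continuation-line behaviour: strptime (and TSLogParser._read_timestamp) raise ValueError for undateable input, which routes such lines into append_line on the previous entry. A sketch of that path, assuming it runs inside the oslogmerger module (where EXTRALINES_PADDING and the classes are defined) and with hypothetical input lines:

    # Sketch only: a traceback continuation line fails date parsing and is
    # glued onto the preceding entry via LogEntry.append_line.
    parser = OSLogParser('api.log')  # filename is never read by this parser
    entry = None
    for line in ['2016-02-01 10:22:59.239 ERROR oslo Traceback\n',
                 '  File "api.py", line 10, in get\n']:
        try:
            dt, dt_str, data = parser.parse_line(line)
            entry = LogEntry('API', dt, data, dt_str=dt_str)
        except ValueError:
            entry.append_line(line)  # undated line: treat as continuation
    print(entry)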
@@ -247,104 +277,37 @@ class LogFile(object):
         return cmp(self.peek(), other.peek())
 
 
-class MsgLogEntry(LogEntry):
-    """Message format: Oct 15 14:11:19"""
-    date_format = '%Y%b %d %H:%M:%S'
-
-    @classmethod
-    def get_init_args(cls, filename):
-        kwargs = super(MsgLogEntry, cls).get_init_args(filename)
-        stat = os.stat(filename)
-        kwargs['file_year'] = datetime.fromtimestamp(stat.st_mtime).year
-        return kwargs
-
-    def prepare_line(self, line):
-        # TODO: If year of file creation and file last modification are
-        # different we should start with the cration year and then change to
-        # the next year once the months go back.
-        line = super(MsgLogEntry, self).prepare_line(line)
-        return '%s%s' % (self.file_year, line)
-
-    def _calculate_date_length(self):
-        return super(MsgLogEntry, self)._calculate_date_length() - 4
-
-
-class OSLogEntry(LogEntry):
-    """OpenStack default log: 2016-02-01 10:22:59.239"""
-    date_format = '%Y-%m-%d %H:%M:%S.%f'
-
-    def _calculate_date_length(self):
-        return super(OSLogEntry, self)._calculate_date_length() - 3
-
-
-class TSLogEntry(LogEntry):
-    """Timestamped log: [275514.814982]"""
-
-    @classmethod
-    def get_init_args(cls, filename):
-        kwargs = super(TSLogEntry, cls).get_init_args(filename)
-        stat = os.stat(filename)
-        mtime = datetime.fromtimestamp(stat.st_mtime)
-        timestamp = cls._get_last_timestamp(filename)
-        kwargs['start_date'] = mtime - timedelta(seconds=timestamp)
-        return kwargs
-
-    @classmethod
-    def _get_last_timestamp(cls, filename):
-        result = None
-        with open(filename, 'r') as f:
-            file_size = os.fstat(f.fileno()).st_size
-            # We will jump to the last KB so we don't have to read all file
-            offset = max(0, file_size - 1024)
-            f.seek(offset)
-            for line in f:
-                try:
-                    __, result = cls._read_timestamp(line)
-                except ValueError:
-                    continue
-
-            return result
-
-    @staticmethod
-    def _read_timestamp(line):
-        start = line.index('[') + 1
-        end = line.index(']')
-
-        if end < start:
-            raise ValueError
-
-        return end, float(line[start:end])
-
-    def parse_date(self, date_str):
-        end, timestamp = self._read_timestamp(date_str)
-        self._date_length = end + 1
-        return self.start_date + timedelta(seconds=timestamp)
-
-
-LOG_TYPES = [
-    ('logfiles', OSLogEntry),
-    ('logfiles_m', MsgLogEntry),
-    ('logfiles_t', TSLogEntry),
-]
+LOG_TYPES = {
+    'logfiles': OSLogParser,
+    'logfiles_m': MsgLogParser,
+    'logfiles_t': TSLogParser,
+}
 
 
 def process_logs(cfg):
     filename_alias = {}
     logs = []
-    for arg_name, entry_cls in LOG_TYPES:
+
+    paths_aliases = {}
+    paths_parsers = {}
+    for arg_name, parser_cls in LOG_TYPES.items():
         for filename in getattr(cfg, arg_name):
-            path, alias, is_url = get_path_and_alias(filename,
-                                                     cfg.log_base,
+            path, alias, is_url = get_path_and_alias(filename, cfg.log_base,
                                                      cfg.log_postfix)
-            filename_alias[path] = (filename, alias, is_url)
-            logs.append(LogFile.factory(entry_cls, path))
+            paths_aliases[path] = (filename, alias, is_url)
+            paths_parsers[path] = parser_cls
+
+    # NOTE(mdbooth): I feel like generate_aliases should take a single path,
+    # which would make this loop much tidier. I don't want to unpick it right
+    # now, though.
+    aliases = generate_aliases(paths_aliases, cfg)
 
-    alias = generate_aliases(filename_alias, cfg)
+    logs = [LogFile(path, aliases[path], parser_cls)
+            for path, parser_cls in paths_parsers.items()]
 
     entry_iters = [iter(log) for log in logs]
     for entry in heapq.merge(*entry_iters):
-        print('%s [%s] %s' % (entry.date_str, alias[entry.filename],
-              entry.data.rstrip('\n')))
+        print(entry)
 
 
 def get_path_and_alias(filename, log_base, log_postfix):
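The merge loop itself is untouched: each LogFile yields LogEntry objects in file order, and heapq.merge interleaves the already-sorted streams using the __cmp__ defined above (cmp and __cmp__ exist only on Python 2; a Python 3 port would need __lt__). A rough sketch with hypothetical in-memory streams standing in for LogFile iterators, again assuming the module's LogEntry is in scope:

    # Sketch only: two pre-sorted entry streams, as LogFile iterators would be.
    import heapq
    from datetime import datetime, timedelta

    base = datetime(2016, 2, 1, 10, 22, 59)
    nova = [LogEntry('NOVA', base + timedelta(seconds=s), 'nova line\n')
            for s in (0, 2)]
    cinder = [LogEntry('CINDER', base + timedelta(seconds=1), 'cinder line\n')]

    for entry in heapq.merge(iter(nova), iter(cinder)):
        print(entry)  # NOVA, CINDER, NOVA: interleaved in dt order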
