Update regex for better HTML detection.

We need to do two things to support better HTML detection. First, we must ignore case, since HTML is not case-sensitive. Second, we must allow the first line to be either a <!doctype html.* doctype declaration or an <html.* tag. Change-Id: Id83c7f07f1bd19288b5119b4f5e88ab290af336f
2015-03-16 14:19:57 -07:00 · 2015-03-16 14:19:57 -07:00 · 74b5c7c90c
commit 74b5c7c90c
parent f2b963b1a1
4 changed files with 57 additions and 1 deletions
--- a/os_loganalyze/tests/samples/sample.html
+++ b/os_loganalyze/tests/samples/sample.html
@ -0,0 +1,2 @@
+<hTML>
+</html>
--- a/os_loganalyze/tests/samples/sample_doctype.html
+++ b/os_loganalyze/tests/samples/sample_doctype.html
@ -0,0 +1,3 @@
+<!doctype html PUBLIC "-//W3C//DTD html 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
+<html>
+</html>
--- a/os_loganalyze/tests/test_views.py
+++ b/os_loganalyze/tests/test_views.py
@ -0,0 +1,51 @@
+#!/usr/bin/env python
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+"""
+Test the view generators
+"""
+
+import os_loganalyze.filter as osfilter
+import os_loganalyze.generator as osgen
+from os_loganalyze.tests import base
+import os_loganalyze.view as osview
+
+
+class TestViews(base.TestCase):
+    """Exercise HTMLView's is_html detection on the sample files."""
+    def get_generator(self, fname):
+        # Override base's get_generator because we don't want the full
+        # wsgi application. We just need the generator to give to Views.
+        root_path = base.samples_path(self.samples_directory)
+        kwargs = {'PATH_INFO': '/htmlify/%s' % fname}
+        logname, gen = osgen.get(self.fake_env(**kwargs), root_path)
+        return osfilter.Filter(logname, gen)
+
+    def test_html_detection(self):
+        """A first line of '<hTML>' must set is_html, whatever its case."""
+        html_view = osview.HTMLView(self.get_generator('sample.html'))
+        view_iter = iter(html_view)
+        self.assertFalse(html_view.is_html)
+        # Advance one step so the view gets to set its is_html flag; the
+        # builtin next() works on both Python 2.6+ and Python 3.
+        next(view_iter)
+        self.assertTrue(html_view.is_html)
+
+    def test_doctype_html_detection(self):
+        """A leading '<!doctype html ...' line must also set is_html."""
+        html_view = osview.HTMLView(self.get_generator('sample_doctype.html'))
+        view_iter = iter(html_view)
+        self.assertFalse(html_view.is_html)
+        # Advance one step so the view gets to set its is_html flag.
+        next(view_iter)
+        self.assertTrue(html_view.is_html)
--- a/os_loganalyze/view.py
+++ b/os_loganalyze/view.py
@ -106,7 +106,7 @@ highlight_by_hash();
 DATE_LINE = ("<span class='%s %s'><a name='%s' class='date' href='#%s'>"
             "%s</a>%s\n</span>")
 NONDATE_LINE = "<span class='%s'>%s\n</span>"
-HTML_RE = re.compile("<html")
+HTML_RE = re.compile("<(!doctype )?html", re.IGNORECASE)
 SKIP_LINES = re.compile("</?pre>")

 # pre tags mean we're partial html and shouldn't escape