Update regex for better HTML detection.

We need to do two things to support better HTML detection. First, we must
ignore case, since HTML is not case-sensitive. Second, we allow the first
line to be either a <!doctype html.* declaration or a <html.* tag.

Change-Id: Id83c7f07f1bd19288b5119b4f5e88ab290af336f
This commit is contained in:
Clark Boylan 2015-03-16 14:19:57 -07:00
parent f2b963b1a1
commit 74b5c7c90c
4 changed files with 57 additions and 1 deletions

View File

@ -0,0 +1,2 @@
<hTML>
</html>

View File

@ -0,0 +1,3 @@
<!doctype html PUBLIC "-//W3C//DTD html 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<html>
</html>

View File

@ -0,0 +1,51 @@
#!/usr/bin/env python
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""
Test the view generators
"""
import os_loganalyze.filter as osfilter
import os_loganalyze.generator as osgen
from os_loganalyze.tests import base
import os_loganalyze.view as osview
class TestViews(base.TestCase):
    """Tests that HTMLView flags HTML input once iteration has started."""

    def get_generator(self, fname):
        """Return a filtered line generator for the sample file ``fname``.

        Overrides base's get_generator because we don't want the full
        wsgi application. We just need the generator to give to Views.

        NOTE(review): ``samples_directory`` and ``fake_env`` are presumably
        supplied by ``base.TestCase`` -- confirm against the base class.
        """
        root_path = base.samples_path(self.samples_directory)
        kwargs = {'PATH_INFO': '/htmlify/%s' % fname}
        logname, gen = osgen.get(self.fake_env(**kwargs), root_path)
        return osfilter.Filter(logname, gen)

    def _assert_detected_as_html(self, fname):
        """Assert HTMLView's is_html flips to True after one item is read."""
        gen = self.get_generator(fname)
        html_view = osview.HTMLView(gen)
        i = iter(html_view)
        # Nothing has been consumed yet, so the flag must still be unset.
        self.assertFalse(html_view.is_html)
        # Move the generator so that the is_html flag is set. The builtin
        # next() works on both Python 2 and 3; i.next() is Python 2 only.
        next(i)
        self.assertTrue(html_view.is_html)

    def test_html_detection(self):
        """A file whose first line is an <html> tag in mixed case."""
        self._assert_detected_as_html('sample.html')

    def test_doctype_html_detection(self):
        """A file whose first line is a <!doctype html ...> declaration."""
        self._assert_detected_as_html('sample_doctype.html')

View File

@ -106,7 +106,7 @@ highlight_by_hash();
# %-format template with six %s slots; the anchor carries class 'date'.
# NOTE(review): presumably used for log lines that begin with a date --
# confirm against the code that fills it in.
DATE_LINE = ("<span class='%s %s'><a name='%s' class='date' href='#%s'>"
"%s</a>%s\n</span>")
# %-format template with two %s slots: a span class and the line content.
NONDATE_LINE = "<span class='%s'>%s\n</span>"
# Case-sensitive pattern for a literal "<html".
# NOTE(review): this binding is immediately replaced by the next line, so
# it is never used -- presumably the pre-change side of this diff hunk.
HTML_RE = re.compile("<html")
# Matches "<html" or "<!doctype html" in any letter case (re.IGNORECASE).
# The optional group requires exactly one space between "!doctype" and "html".
HTML_RE = re.compile("<(!doctype )?html", re.IGNORECASE)
# Matches an opening or closing pre tag: "<pre>" or "</pre>".
SKIP_LINES = re.compile("</?pre>")
# pre tags mean we're partial html and shouldn't escape