Created new parser, HtmlParser, based on the stdlib HTMLParser module.

Added AutoSelectParser, which picks LxmlParser if lxml is available and falls back to HtmlParser if not; it is also the new default. Created a special BeautifulSoupTest in order to keep testing the BeautifulSoup parser. Updated README, installation and settings docs to reflect these changes.
2011-04-19 20:22:08 +02:00
parent cee8021c6b
commit b6d5131611
11 changed files with 168 additions and 51 deletions
--- a/README.rst
+++ b/README.rst
@@ -29,9 +29,10 @@ Configurability & Extendibility
 -------------------------------
 Django Compressor is highly configurable and extendible. The HTML parsing
-is done using BeautifulSoup_ by default. As an alternative Django Compressor
+is done using lxml_ or if it's not available Python's built-in HTMLParser by
-provides an lxml_ and a html5lib_ based parser, as well as an abstract base
+default. As an alternative Django Compressor provides a BeautifulSoup_ and a
-class that makes it easy to write a custom parser.
+html5lib_ based parser, as well as an abstract base class that makes it easy to
 write a custom parser.
 Django Compressor also comes with built-in support for `CSS Tidy`_,
 `YUI CSS and JS`_ compressor, the Google's `Closure Compiler`_, a Python
--- a/compressor/base.py
+++ b/compressor/base.py
@@ -88,8 +88,6 @@ class Compressor(object):
    def hunks(self):
        for kind, value, elem in self.split_contents():
            if kind == "hunk":
                # Let's cast BeautifulSoup element to unicode here since
                # it will try to encode using ascii internally later
                yield unicode(self.filter(
                    value, method="input", elem=elem, kind=kind))
            elif kind == "file":
--- a/compressor/parser/init.py
+++ b/compressor/parser/init.py
@@ -1,5 +1,31 @@
 from django.utils.functional import LazyObject
 from django.utils.importlib import import_module
 # support legacy parser module usage
 from compressor.parser.base import ParserBase
 from compressor.parser.beautifulsoup import BeautifulSoupParser
 from compressor.parser.lxml import LxmlParser
 from compressor.parser.htmlparser import HtmlParser
 from compressor.parser.beautifulsoup import BeautifulSoupParser
 from compressor.parser.html5lib import Html5LibParser
 class AutoSelectParser(LazyObject):
    """
    Parser proxy that wraps the first parser from ``options`` whose
    dependency can be imported and forwards attribute lookups to it.
    """
    # Pairs of (module that has to be importable, parser class to use),
    # tried in the order they are listed.
    options = (
        ('lxml.html', LxmlParser),  # lxml, extremely fast
        ('HTMLParser', HtmlParser), # fast and part of the Python stdlib
    )

    def __init__(self, content):
        """
        Select a parser for ``content`` immediately.
        """
        self._wrapped = None
        self._setup(content)

    def __getattr__(self, name):
        """
        Look up ``name`` on the wrapped parser.
        """
        wrapped = self._wrapped
        return getattr(wrapped, name)

    def _setup(self, content):
        """
        Walk through ``options`` and wrap the first parser that can be
        created without an ``ImportError``.
        """
        for module_name, parser_cls in self.options:
            try:
                import_module(module_name)
                self._wrapped = parser_cls(content)
            except ImportError:
                # dependency is missing, try the next candidate
                continue
            else:
                break
--- a/compressor/parser/html5lib.py
+++ b/compressor/parser/html5lib.py
@@ -47,4 +47,7 @@ class Html5LibParser(ParserBase):
        return elem.name
    def elem_str(self, elem):
        """
        Return the serialized form of ``elem`` as unicode.

        The HTML produced here does not pass all tests; since this method
        is only called in tests anyway, it doesn't really matter.
        """
        serialized = self._serialize(elem)
        return smart_unicode(serialized)
--- a/compressor/parser/htmlparser.py
+++ b/compressor/parser/htmlparser.py
@@ -0,0 +1,77 @@
 from HTMLParser import HTMLParser
 from django.utils.encoding import smart_unicode
 from django.utils.datastructures import SortedDict
 from compressor.exceptions import ParserError
 from compressor.parser import ParserBase
 class HtmlParser(ParserBase, HTMLParser):
    def __init__(self, content):
        HTMLParser.__init__(self)
        self.content = content
        self._css_elems = []
        self._js_elems = []
        self._current_tag = None
        try:
            self.feed(self.content)
            self.close()
        except Exception, err:
            raise ParserError("Error while initializing HtmlParser: %s" % err)
    def handle_starttag(self, tag, attrs):
        tag = tag.lower()
        if tag in ('style', 'script'):
            if tag == 'style':
                tags = self._css_elems
            elif tag == 'script':
                tags = self._js_elems
            tags.append({
                'tag': tag,
                'attrs': attrs,
                'attrs_dict': dict(attrs),
                'text': ''
            })
            self._current_tag = tag
        elif tag == 'link':
            self._css_elems.append({
                'tag': tag,
                'attrs': attrs,
                'attrs_dict': dict(attrs),
                'text': None
            })
    def handle_endtag(self, tag):
        if self._current_tag and self._current_tag == tag.lower():
            self._current_tag = None
    def handle_data(self, data):
        if self._current_tag == 'style':
            self._css_elems[-1]['text'] = data
        elif self._current_tag == 'script':
            self._js_elems[-1]['text'] = data
    def css_elems(self):
        return self._css_elems
    def js_elems(self):
        return self._js_elems
    def elem_name(self, elem):
        return elem['tag']
    def elem_attribs(self, elem):
        return elem['attrs_dict']
    def elem_content(self, elem):
        return smart_unicode(elem['text'])
    def elem_str(self, elem):
        tag = {}
        tag.update(elem)
        tag['attrs'] = ''
        if len(elem['attrs']):
            tag['attrs'] = ' %s' % ' '.join(['%s="%s"' % (name, value) for name, value in elem['attrs']])
        if elem['tag'] == 'link':
            return '<%(tag)s%(attrs)s />' % tag
        else:
            return '<%(tag)s%(attrs)s>%(text)s</%(tag)s>' % tag
--- a/compressor/parser/lxml.py
+++ b/compressor/parser/lxml.py
@@ -15,6 +15,7 @@ class LxmlParser(ParserBase):
        try:
            from lxml.html import fromstring, soupparser
            from lxml.etree import tostring
            self.tostring = tostring
            tree = fromstring(content)
            try:
                ignore = tostring(tree, encoding=unicode)
@@ -43,6 +44,9 @@ class LxmlParser(ParserBase):
        return elem.tag
    def elem_str(self, elem):
-        from lxml import etree
+        elem_as_string = smart_unicode(
-        return smart_unicode(
+            self.tostring(elem, method='html', encoding=unicode))
-            etree.tostring(elem, method='html', encoding=unicode))
+        if elem.tag == 'link':
            # This makes testcases happy
            return elem_as_string.replace('>', ' />')
        return elem_as_string
--- a/compressor/settings.py
+++ b/compressor/settings.py
@@ -12,7 +12,7 @@ class CompressorSettings(AppSettings):
    # GET variable that disables compressor e.g. "nocompress"
    DEBUG_TOGGLE = "None"
    # the backend to use when parsing the JavaScript or Stylesheet files
-    PARSER = 'compressor.parser.BeautifulSoupParser'
+    PARSER = 'compressor.parser.AutoSelectParser'
    OUTPUT_DIR = 'CACHE'
    STORAGE = 'compressor.storage.CompressorFileStorage'
--- a/compressor/tests/tests.py
+++ b/compressor/tests/tests.py
@@ -15,6 +15,11 @@ try:
 except ImportError:
    html5lib = None
 try:
    from BeautifulSoup import BeautifulSoup
 except ImportError:
    BeautifulSoup = None
 from django.core.cache.backends import dummy
 from django.core.files.storage import get_storage_class
 from django.template import Template, Context, TemplateSyntaxError
@@ -31,6 +36,7 @@ from compressor.utils import find_command
 class CompressorTestCase(TestCase):
    def setUp(self):
        self.maxDiff = None
        settings.COMPRESS_ENABLED = True
        settings.COMPRESS_PRECOMPILERS = {}
        settings.COMPRESS_DEBUG_TOGGLE = 'nocompress'
@@ -136,29 +142,25 @@ class CompressorTestCase(TestCase):
        finally:
            settings.COMPRESS_OUTPUT_DIR = old_output_dir
 class LxmlCompressorTestCase(CompressorTestCase):
-    def test_css_split(self):
+class ParserTestCase(object):
        out = [
            ('file', os.path.join(settings.COMPRESS_ROOT, u'css/one.css'), u'<link rel="stylesheet" href="/media/css/one.css" type="text/css" charset="utf-8">'),
            ('hunk', u'p { border:5px solid green;}', u'<style type="text/css">p { border:5px solid green;}</style>'),
            ('file', os.path.join(settings.COMPRESS_ROOT, u'css/two.css'), u'<link rel="stylesheet" href="/media/css/two.css" type="text/css" charset="utf-8">'),
        ]
        split = self.css_node.split_contents()
        split = [(x[0], x[1], self.css_node.parser.elem_str(x[2])) for x in split]
        self.assertEqual(out, split)
    def setUp(self):
        self.old_parser = settings.COMPRESS_PARSER
-        settings.COMPRESS_PARSER = 'compressor.parser.LxmlParser'
+        settings.COMPRESS_PARSER = self.parser_cls
-        super(LxmlCompressorTestCase, self).setUp()
+        super(ParserTestCase, self).setUp()
    def tearDown(self):
        settings.COMPRESS_PARSER = self.old_parser
 LxmlCompressorTestCase = skipIf(lxml is None, 'lxml not found')(LxmlCompressorTestCase)
-class Html5LibCompressorTesCase(CompressorTestCase):
+class LxmlParserTests(ParserTestCase, CompressorTestCase):
    parser_cls = 'compressor.parser.LxmlParser'
 LxmlParserTests = skipIf(lxml is None, 'lxml not found')(LxmlParserTests)
 class Html5LibParserTests(ParserTestCase, CompressorTestCase):
    parser_cls = 'compressor.parser.Html5LibParser'
    def test_css_split(self):
        out = [
@@ -178,14 +180,19 @@ class Html5LibCompressorTesCase(CompressorTestCase):
        split = [(x[0], x[1], self.js_node.parser.elem_str(x[2])) for x in split]
        self.assertEqual(out, split)
-    def setUp(self):
+Html5LibParserTests = skipIf(
-        self.old_parser = settings.COMPRESS_PARSER
+    html5lib is None, 'html5lib not found')(Html5LibParserTests)
        settings.COMPRESS_PARSER = 'compressor.parser.Html5LibParser'
        super(Html5LibCompressorTesCase, self).setUp()
-    def tearDown(self):
+
-        settings.COMPRESS_PARSER = self.old_parser
+class BeautifulSoupParserTests(ParserTestCase, CompressorTestCase):
-Html5LibCompressorTesCase = skipIf(html5lib is None, 'html5lib not found')(Html5LibCompressorTesCase)
+    parser_cls = 'compressor.parser.BeautifulSoupParser'
 BeautifulSoupParserTests = skipIf(
    BeautifulSoup is None, 'BeautifulSoup not found')(BeautifulSoupParserTests)
 class HtmlParserTests(ParserTestCase, CompressorTestCase):
    parser_cls = 'compressor.parser.HtmlParser'
 class CssAbsolutizingTestCase(TestCase):
--- a/docs/installation.txt
+++ b/docs/installation.txt
@@ -35,10 +35,10 @@ Installation
 Dependencies
 ------------
-BeautifulSoup_
+BeautifulSoup_ (optional)
-^^^^^^^^^^^^^^
+^^^^^^^^^^^^^^^^^^^^^^^^^
-for the default :ref:`parser <compress_parser>`
+for the :ref:`parser <compress_parser>`
 ``compressor.parser.BeautifulSoupParser``::
    pip install BeautifulSoup
@@ -46,16 +46,15 @@ for the default :ref:`parser <compress_parser>`
 lxml_ (optional)
 ^^^^^^^^^^^^^^^^
-for the optional :ref:`parser <compress_parser>`
+for the :ref:`parser <compress_parser>` ``compressor.parser.LxmlParser``,
-``compressor.parser.LxmlParser``, also requires libxml2_::
+also requires libxml2_::
    STATIC_DEPS=true pip install lxml
 html5lib_ (optional)
 ^^^^^^^^^^^^^^^^^^^^
-for the optional :ref:`parser <compress_parser>`
+for the :ref:`parser <compress_parser>` ``compressor.parser.Html5LibParser``::
 ``compressor.parser.Html5LibParser``::
    pip install html5lib
@@ -63,10 +62,3 @@ for the optional :ref:`parser <compress_parser>`
 .. _lxml: http://codespeak.net/lxml/
 .. _libxml2: http://xmlsoft.org/
 .. _html5lib: http://code.google.com/p/html5lib/
 Deprecation
 -----------
 This section lists features and settings that are deprecated or removed
 in newer versions of Django Compressor.
--- a/docs/settings.txt
+++ b/docs/settings.txt
@@ -164,13 +164,25 @@ Django Compressor ships with one additional storage backend:
 COMPRESS_PARSER
 ^^^^^^^^^^^^^^^
-:Default: ``'compressor.parser.BeautifulSoupParser'``
+:Default: ``'compressor.parser.AutoSelectParser'``
 The backend to use when parsing the JavaScript or Stylesheet files. The
 ``AutoSelectParser`` picks the ``lxml`` based parser when available, and falls
 back to ``HtmlParser`` if ``lxml`` is not available.
 ``LxmlParser`` is the fastest available parser, but ``HtmlParser`` is not much
 slower. ``AutoSelectParser`` adds a slight overhead, but in most cases it
 won't be necessary to change the default parser.
 The other two included parsers are considerably slower and should only be
 used if absolutely necessary.
 The backend to use when parsing the JavaScript or Stylesheet files.
 The backends included in Django Compressor:
- ``compressor.parser.BeautifulSoupParser``
+- ``compressor.parser.AutoSelectParser``
 - ``compressor.parser.LxmlParser``
 - ``compressor.parser.HtmlParser``
 - ``compressor.parser.BeautifulSoupParser``
 - ``compressor.parser.Html5LibParser``
 See :ref:`dependencies` for more info about the packages you need
--- a/setup.py
+++ b/setup.py
@@ -111,9 +111,6 @@ setup(
    author_email = 'jannis@leidel.info',
    packages = find_packages(),
    package_data = find_package_data('compressor', only_in_packages=False),
    install_requires = [
        'BeautifulSoup',
    ],
    classifiers = [
        'Development Status :: 4 - Beta',
        'Framework :: Django',