Extracted parsers into their own separate modules. Created an html5lib-based parser and mentioned it in the docs.

2011-04-17 15:47:44 +02:00 · 2011-04-17 15:47:44 +02:00 · a545657400
commit a545657400
parent 1f68326b87
9 changed files with 218 additions and 119 deletions
--- a/compressor/parser.py
+++ b/compressor/parser.py
@ -1,119 +0,0 @@
-from django.utils.encoding import smart_unicode
-
-from compressor.exceptions import ParserError
-
-
-class ParserBase(object):
-    """
-    Base parser to be subclassed when creating an own parser.
-    """
-    def __init__(self, content):
-        self.content = content
-
-    def css_elems(self):
-        """
-        Return an iterable containing the css elements to handle
-        """
-        raise NotImplementedError
-
-    def js_elems(self):
-        """
-        Return an iterable containing the js elements to handle
-        """
-        raise NotImplementedError
-
-    def elem_attribs(self, elem):
-        """
-        Return the dictionary like attribute store of the given element
-        """
-        raise NotImplementedError
-
-    def elem_content(self, elem):
-        """
-        Return the content of the given element
-        """
-        raise NotImplementedError
-
-    def elem_name(self, elem):
-        """
-        Return the name of the given element
-        """
-        raise NotImplementedError
-
-    def elem_str(self, elem):
-        """
-        Return the string representation of the given elem
-        """
-        raise NotImplementedError
-
-
-class BeautifulSoupParser(ParserBase):
-    _soup = None
-
-    @property
-    def soup(self):
-        try:
-            from BeautifulSoup import BeautifulSoup
-        except ImportError, e:
-            raise ParserError("Error while initializing Parser: %s" % e)
-        if self._soup is None:
-            self._soup = BeautifulSoup(self.content)
-        return self._soup
-
-    def css_elems(self):
-        return self.soup.findAll({'link': True, 'style': True})
-
-    def js_elems(self):
-        return self.soup.findAll('script')
-
-    def elem_attribs(self, elem):
-        return dict(elem.attrs)
-
-    def elem_content(self, elem):
-        return elem.string
-
-    def elem_name(self, elem):
-        return elem.name
-
-    def elem_str(self, elem):
-        return smart_unicode(elem)
-
-
-class LxmlParser(ParserBase):
-    _tree = None
-
-    @property
-    def tree(self):
-        try:
-            from lxml import html
-            from lxml.etree import tostring
-        except ImportError, e:
-            raise ParserError("Error while initializing Parser: %s" % e)
-        if self._tree is None:
-            content = '<root>%s</root>' % self.content
-            self._tree = html.fromstring(content)
-            try:
-                ignore = tostring(self._tree, encoding=unicode)
-            except UnicodeDecodeError:
-                self._tree = html.soupparser.fromstring(content)
-        return self._tree
-
-    def css_elems(self):
-        return self.tree.xpath('link[@rel="stylesheet"]|style')
-
-    def js_elems(self):
-        return self.tree.findall('script')
-
-    def elem_attribs(self, elem):
-        return elem.attrib
-
-    def elem_content(self, elem):
-        return smart_unicode(elem.text)
-
-    def elem_name(self, elem):
-        return elem.tag
-
-    def elem_str(self, elem):
-        from lxml import etree
-        return smart_unicode(
-            etree.tostring(elem, method='html', encoding=unicode))
--- a/compressor/parser/__init__.py
+++ b/compressor/parser/__init__.py
@ -0,0 +1,5 @@
+# support legacy parser module usage
+from compressor.parser.base import ParserBase
+from compressor.parser.beautifulsoup_parser import BeautifulSoupParser
+from compressor.parser.lxml_parser import LxmlParser
+from compressor.parser.html5lib_parser import Html5LibParser
--- a/compressor/parser/base.py
+++ b/compressor/parser/base.py
@ -0,0 +1,42 @@
+class ParserBase(object):
+    """
+    Base parser to be subclassed when creating an own parser.
+    """
+    def __init__(self, content):
+        self.content = content
+
+    def css_elems(self):
+        """
+        Return an iterable containing the css elements to handle
+        """
+        raise NotImplementedError
+
+    def js_elems(self):
+        """
+        Return an iterable containing the js elements to handle
+        """
+        raise NotImplementedError
+
+    def elem_attribs(self, elem):
+        """
+        Return the dictionary like attribute store of the given element
+        """
+        raise NotImplementedError
+
+    def elem_content(self, elem):
+        """
+        Return the content of the given element
+        """
+        raise NotImplementedError
+
+    def elem_name(self, elem):
+        """
+        Return the name of the given element
+        """
+        raise NotImplementedError
+
+    def elem_str(self, elem):
+        """
+        Return the string representation of the given elem
+        """
+        raise NotImplementedError
--- a/compressor/parser/beautifulsoup_parser.py
+++ b/compressor/parser/beautifulsoup_parser.py
@ -0,0 +1,34 @@
+from compressor.exceptions import ParserError
+from compressor.parser import ParserBase
+from django.utils.encoding import smart_unicode
+
+class BeautifulSoupParser(ParserBase):
+    _soup = None
+
+    @property
+    def soup(self):
+        if self._soup is None:
+            try:
+                from BeautifulSoup import BeautifulSoup
+            except ImportError, e:
+                raise ParserError("Error while initializing Parser: %s" % e)
+            self._soup = BeautifulSoup(self.content)
+        return self._soup
+
+    def css_elems(self):
+        return self.soup.findAll({'link': True, 'style': True})
+
+    def js_elems(self):
+        return self.soup.findAll('script')
+
+    def elem_attribs(self, elem):
+        return dict(elem.attrs)
+
+    def elem_content(self, elem):
+        return elem.string
+
+    def elem_name(self, elem):
+        return elem.name
+
+    def elem_str(self, elem):
+        return smart_unicode(elem)
--- a/compressor/parser/html5lib_parser.py
+++ b/compressor/parser/html5lib_parser.py
@ -0,0 +1,51 @@
+from compressor.exceptions import ParserError
+from compressor.parser import ParserBase
+from django.utils.encoding import smart_unicode
+
+try:
+    import html5lib
+except ImportError:  # html5lib is optional; keep the module importable without it
+    html5lib = None
+
+def _serialize(el):
+    fragment = html5lib.treebuilders.simpletree.DocumentFragment()
+    fragment.appendChild(el)
+    return html5lib.serialize(fragment, quote_attr_values=True,
+        omit_optional_tags=False)
+
+def _find(tree, *names):
+    for node in tree.childNodes:
+        if node.type == 5 and node.name in names:
+            yield node
+
+class Html5LibParser(ParserBase):
+    _html = None
+
+    @property
+    def html(self):
+        if self._html is None:
+            try:
+                import html5lib
+                self._html = html5lib.parseFragment(self.content)
+            except Exception, e:
+                raise ParserError("Error while initializing Parser: %s" % e)
+        return self._html
+
+
+    def css_elems(self):
+        return _find(self.html, 'style', 'link')
+
+    def js_elems(self):
+        return _find(self.html, 'script')
+
+    def elem_attribs(self, elem):
+        return elem.attributes
+
+    def elem_content(self, elem):
+        return elem.childNodes[0].value
+
+    def elem_name(self, elem):
+        return elem.name
+
+    def elem_str(self, elem):
+        return smart_unicode(_serialize(elem))
--- a/compressor/parser/lxml_parser.py
+++ b/compressor/parser/lxml_parser.py
@ -0,0 +1,43 @@
+from compressor.exceptions import ParserError
+from compressor.parser import ParserBase
+
+from django.utils.encoding import smart_unicode
+
+class LxmlParser(ParserBase):
+    _tree = None
+
+    @property
+    def tree(self):
+        try:
+            from lxml import html
+            from lxml.etree import tostring
+        except ImportError, e:
+            raise ParserError("Error while initializing Parser: %s" % e)
+        if self._tree is None:
+            content = '<root>%s</root>' % self.content
+            self._tree = html.fromstring(content)
+            try:
+                ignore = tostring(self._tree, encoding=unicode)
+            except UnicodeDecodeError:
+                self._tree = html.soupparser.fromstring(content)
+        return self._tree
+
+    def css_elems(self):
+        return self.tree.xpath('link[@rel="stylesheet"]|style')
+
+    def js_elems(self):
+        return self.tree.findall('script')
+
+    def elem_attribs(self, elem):
+        return elem.attrib
+
+    def elem_content(self, elem):
+        return smart_unicode(elem.text)
+
+    def elem_name(self, elem):
+        return elem.tag
+
+    def elem_str(self, elem):
+        from lxml import etree
+        return smart_unicode(
+            etree.tostring(elem, method='html', encoding=unicode))
--- a/compressor/tests/tests.py
+++ b/compressor/tests/tests.py
@ -8,6 +8,11 @@ try:
 except ImportError:
    lxml = None

+try:
+    import html5lib
+except ImportError:
+    html5lib = None
+
 from django.core.cache.backends import dummy
 from django.core.files.storage import get_storage_class
 from django.template import Template, Context, TemplateSyntaxError
@ -150,6 +155,34 @@ if lxml:
        def tearDown(self):
            settings.COMPRESS_PARSER = self.old_parser

+if html5lib:
+    class Html5LibCompressorTesCase(CompressorTestCase):
+
+        def test_css_split(self):
+            out = [
+                ('file', os.path.join(settings.COMPRESS_ROOT, u'css/one.css'), u'<link charset="utf-8" href="/media/css/one.css" rel="stylesheet" type="text/css">'),
+                ('hunk', u'p { border:5px solid green;}', u'<style type="text/css">p { border:5px solid green;}</style>'),
+                ('file', os.path.join(settings.COMPRESS_ROOT, u'css/two.css'), u'<link charset="utf-8" href="/media/css/two.css" rel="stylesheet" type="text/css">'),
+            ]
+            split = self.css_node.split_contents()
+            split = [(x[0], x[1], self.css_node.parser.elem_str(x[2])) for x in split]
+            self.assertEqual(out, split)
+
+        def test_js_split(self):
+            out = [('file', os.path.join(settings.COMPRESS_ROOT, u'js/one.js'), u'<script charset="utf-8" src="/media/js/one.js" type="text/javascript"></script>'),
+             ('hunk', u'obj.value = "value";', u'<script charset="utf-8" type="text/javascript">obj.value = "value";</script>')
+             ]
+            split = self.js_node.split_contents()
+            split = [(x[0], x[1], self.js_node.parser.elem_str(x[2])) for x in split]
+            self.assertEqual(out, split)
+
+        def setUp(self):
+            self.old_parser = settings.COMPRESS_PARSER
+            settings.COMPRESS_PARSER = 'compressor.parser.Html5LibParser'
+            super(Html5LibCompressorTesCase, self).setUp()
+
+        def tearDown(self):
+            settings.COMPRESS_PARSER = self.old_parser

 class CssAbsolutizingTestCase(TestCase):
    def setUp(self):
--- a/docs/installation.txt
+++ b/docs/installation.txt
@ -51,9 +51,18 @@ for the optional :ref:`parser <compress_parser>`

    STATIC_DEPS=true pip install lxml

+html5lib_ (optional)
+^^^^^^^^^^^^^^^^^^^^
+
+for the optional :ref:`parser <compress_parser>`
+``compressor.parser.Html5LibParser``::
+
+    pip install html5lib
+
 .. _BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/
 .. _lxml: http://codespeak.net/lxml/
 .. _libxml2: http://xmlsoft.org/
+.. _html5lib: http://code.google.com/p/html5lib/

 Deprecation
 -----------
--- a/docs/settings.txt
+++ b/docs/settings.txt
@ -171,6 +171,7 @@ The backends included in Django Compressor:

 - ``compressor.parser.BeautifulSoupParser``
 - ``compressor.parser.LxmlParser``
+- ``compressor.parser.Html5LibParser``

 See :ref:`dependencies` for more info about the packages you need
 for each parser.