diff --git a/compressor/parser.py b/compressor/parser.py
deleted file mode 100644
index 8ac09a6..0000000
--- a/compressor/parser.py
+++ /dev/null
@@ -1,119 +0,0 @@
-from django.utils.encoding import smart_unicode
-
-from compressor.exceptions import ParserError
-
-
-class ParserBase(object):
- """
- Base parser to be subclassed when creating an own parser.
- """
- def __init__(self, content):
- self.content = content
-
- def css_elems(self):
- """
- Return an iterable containing the css elements to handle
- """
- raise NotImplementedError
-
- def js_elems(self):
- """
- Return an iterable containing the js elements to handle
- """
- raise NotImplementedError
-
- def elem_attribs(self, elem):
- """
- Return the dictionary like attribute store of the given element
- """
- raise NotImplementedError
-
- def elem_content(self, elem):
- """
- Return the content of the given element
- """
- raise NotImplementedError
-
- def elem_name(self, elem):
- """
- Return the name of the given element
- """
- raise NotImplementedError
-
- def elem_str(self, elem):
- """
- Return the string representation of the given elem
- """
- raise NotImplementedError
-
-
-class BeautifulSoupParser(ParserBase):
- _soup = None
-
- @property
- def soup(self):
- try:
- from BeautifulSoup import BeautifulSoup
- except ImportError, e:
- raise ParserError("Error while initializing Parser: %s" % e)
- if self._soup is None:
- self._soup = BeautifulSoup(self.content)
- return self._soup
-
- def css_elems(self):
- return self.soup.findAll({'link': True, 'style': True})
-
- def js_elems(self):
- return self.soup.findAll('script')
-
- def elem_attribs(self, elem):
- return dict(elem.attrs)
-
- def elem_content(self, elem):
- return elem.string
-
- def elem_name(self, elem):
- return elem.name
-
- def elem_str(self, elem):
- return smart_unicode(elem)
-
-
-class LxmlParser(ParserBase):
- _tree = None
-
- @property
- def tree(self):
- try:
- from lxml import html
- from lxml.etree import tostring
- except ImportError, e:
- raise ParserError("Error while initializing Parser: %s" % e)
- if self._tree is None:
- content = '%s' % self.content
- self._tree = html.fromstring(content)
- try:
- ignore = tostring(self._tree, encoding=unicode)
- except UnicodeDecodeError:
- self._tree = html.soupparser.fromstring(content)
- return self._tree
-
- def css_elems(self):
- return self.tree.xpath('link[@rel="stylesheet"]|style')
-
- def js_elems(self):
- return self.tree.findall('script')
-
- def elem_attribs(self, elem):
- return elem.attrib
-
- def elem_content(self, elem):
- return smart_unicode(elem.text)
-
- def elem_name(self, elem):
- return elem.tag
-
- def elem_str(self, elem):
- from lxml import etree
- return smart_unicode(
- etree.tostring(elem, method='html', encoding=unicode))
diff --git a/compressor/parser/__init__.py b/compressor/parser/__init__.py
new file mode 100644
index 0000000..81eba01
--- /dev/null
+++ b/compressor/parser/__init__.py
@@ -0,0 +1,5 @@
+# support legacy parser module usage
+from compressor.parser.base import ParserBase
+from compressor.parser.beautifulsoup_parser import BeautifulSoupParser
+from compressor.parser.lxml_parser import LxmlParser
+from compressor.parser.html5lib_parser import Html5LibParser
diff --git a/compressor/parser/base.py b/compressor/parser/base.py
new file mode 100644
index 0000000..8bf4dd2
--- /dev/null
+++ b/compressor/parser/base.py
@@ -0,0 +1,42 @@
+class ParserBase(object):
+    """
+    Interface that every parser backend has to implement.
+    """
+    def __init__(self, content):
+        self.content = content
+
+    def css_elems(self):
+        """
+        Give back an iterable of the css elements found in the content
+        """
+        raise NotImplementedError
+
+    def js_elems(self):
+        """
+        Give back an iterable of the js elements found in the content
+        """
+        raise NotImplementedError
+
+    def elem_attribs(self, elem):
+        """
+        Give back the dictionary like attribute store of ``elem``
+        """
+        raise NotImplementedError
+
+    def elem_content(self, elem):
+        """
+        Give back the content of ``elem``
+        """
+        raise NotImplementedError
+
+    def elem_name(self, elem):
+        """
+        Give back the name of ``elem``
+        """
+        raise NotImplementedError
+
+    def elem_str(self, elem):
+        """
+        Give back the string representation of ``elem``
+        """
+        raise NotImplementedError
diff --git a/compressor/parser/beautifulsoup_parser.py b/compressor/parser/beautifulsoup_parser.py
new file mode 100644
index 0000000..1e3003e
--- /dev/null
+++ b/compressor/parser/beautifulsoup_parser.py
@@ -0,0 +1,47 @@
+from compressor.exceptions import ParserError
+from compressor.parser import ParserBase
+from django.utils.encoding import smart_unicode
+
+class BeautifulSoupParser(ParserBase):
+    """
+    Parser backend built on top of BeautifulSoup.
+    """
+    _soup = None
+
+    @property
+    def soup(self):
+        """
+        Soup of ``self.content``; created on first access and then reused.
+
+        Raises ParserError when BeautifulSoup cannot be imported.
+        """
+        if self._soup is not None:
+            return self._soup
+        try:
+            from BeautifulSoup import BeautifulSoup
+        except ImportError, e:
+            raise ParserError("Error while initializing Parser: %s" % e)
+        self._soup = BeautifulSoup(self.content)
+        return self._soup
+
+    def css_elems(self):
+        # Look up tags named "link" or "style".
+        wanted = {'link': True, 'style': True}
+        return self.soup.findAll(wanted)
+
+    def js_elems(self):
+        # Look up tags named "script".
+        return self.soup.findAll('script')
+
+    def elem_attribs(self, elem):
+        # Hand back a fresh dict built from the tag's attrs.
+        return dict(elem.attrs)
+
+    def elem_content(self, elem):
+        return elem.string
+
+    def elem_name(self, elem):
+        return elem.name
+
+    def elem_str(self, elem):
+        return smart_unicode(elem)
diff --git a/compressor/parser/html5lib_parser.py b/compressor/parser/html5lib_parser.py
new file mode 100644
index 0000000..22c7ee6
--- /dev/null
+++ b/compressor/parser/html5lib_parser.py
@@ -0,0 +1,73 @@
+from compressor.exceptions import ParserError
+from compressor.parser import ParserBase
+from django.utils.encoding import smart_unicode
+
+# html5lib is optional: this module is imported by compressor.parser, so it
+# has to stay importable when html5lib is not installed.
+try:
+    import html5lib
+except ImportError:
+    html5lib = None
+
+def _serialize(el):
+    """
+    Serialize a single node to markup using html5lib.
+    """
+    fragment = html5lib.treebuilders.simpletree.DocumentFragment()
+    fragment.appendChild(el)
+    return html5lib.serialize(fragment, quote_attr_values=True,
+        omit_optional_tags=False)
+
+def _find(tree, *names):
+    """
+    Yield the direct children of ``tree`` whose ``type`` is 5 and whose
+    ``name`` is one of ``names``.
+    """
+    for node in tree.childNodes:
+        # NOTE(review): 5 is presumably the node type simpletree gives to
+        # elements -- confirm against html5lib before naming the constant.
+        if node.type == 5 and node.name in names:
+            yield node
+
+class Html5LibParser(ParserBase):
+    """
+    Parser backend built on top of html5lib.
+    """
+    _html = None
+
+    @property
+    def html(self):
+        """
+        Fragment parsed from ``self.content``; created on first access.
+
+        Raises ParserError when html5lib cannot be imported or parsing fails.
+        """
+        if self._html is None:
+            try:
+                import html5lib
+                self._html = html5lib.parseFragment(self.content)
+            except Exception, e:
+                raise ParserError("Error while initializing Parser: %s" % e)
+        return self._html
+
+    def css_elems(self):
+        return _find(self.html, 'style', 'link')
+
+    def js_elems(self):
+        return _find(self.html, 'script')
+
+    def elem_attribs(self, elem):
+        return elem.attributes
+
+    def elem_content(self, elem):
+        # An element without child nodes has no content; return an empty
+        # string rather than failing with IndexError on childNodes[0].
+        if not elem.childNodes:
+            return u''
+        return elem.childNodes[0].value
+
+    def elem_name(self, elem):
+        return elem.name
+
+    def elem_str(self, elem):
+        return smart_unicode(_serialize(elem))
diff --git a/compressor/parser/lxml_parser.py b/compressor/parser/lxml_parser.py
new file mode 100644
index 0000000..ff3c875
--- /dev/null
+++ b/compressor/parser/lxml_parser.py
@@ -0,0 +1,62 @@
+from compressor.exceptions import ParserError
+from compressor.parser import ParserBase
+
+from django.utils.encoding import smart_unicode
+
+class LxmlParser(ParserBase):
+    """
+    Parser backend built on top of lxml.
+    """
+    _tree = None
+
+    @property
+    def tree(self):
+        """
+        Tree parsed from ``self.content``; created on first access.
+
+        Raises ParserError when lxml, or the soupparser fallback, cannot
+        be imported.
+        """
+        if self._tree is None:
+            try:
+                from lxml import html
+                from lxml.etree import tostring
+            except ImportError, e:
+                raise ParserError("Error while initializing Parser: %s" % e)
+            content = '%s' % self.content
+            tree = html.fromstring(content)
+            try:
+                # Serializing is only a probe for whether the tree can be
+                # rendered as unicode; the result itself is discarded.
+                tostring(tree, encoding=unicode)
+            except UnicodeDecodeError:
+                # lxml.html does not import its soupparser submodule by
+                # itself, so load it explicitly before using it.
+                try:
+                    from lxml.html import soupparser
+                except ImportError, e:
+                    raise ParserError(
+                        "Error while initializing Parser: %s" % e)
+                tree = soupparser.fromstring(content)
+            self._tree = tree
+        return self._tree
+
+    def css_elems(self):
+        return self.tree.xpath('link[@rel="stylesheet"]|style')
+
+    def js_elems(self):
+        return self.tree.findall('script')
+
+    def elem_attribs(self, elem):
+        return elem.attrib
+
+    def elem_content(self, elem):
+        return smart_unicode(elem.text)
+
+    def elem_name(self, elem):
+        return elem.tag
+
+    def elem_str(self, elem):
+        from lxml import etree
+        return smart_unicode(
+            etree.tostring(elem, method='html', encoding=unicode))
diff --git a/compressor/tests/tests.py b/compressor/tests/tests.py
index 5fdcbd3..2d1538c 100644
--- a/compressor/tests/tests.py
+++ b/compressor/tests/tests.py
@@ -8,6 +8,11 @@ try:
except ImportError:
lxml = None
+try:
+ import html5lib
+except ImportError:
+ html5lib = None
+
from django.core.cache.backends import dummy
from django.core.files.storage import get_storage_class
from django.template import Template, Context, TemplateSyntaxError
@@ -150,6 +155,34 @@ if lxml:
def tearDown(self):
settings.COMPRESS_PARSER = self.old_parser
+if html5lib:
+    class Html5LibCompressorTestCase(CompressorTestCase):
+        """Run the CompressorTestCase checks with Html5LibParser configured."""
+        def test_css_split(self):
+            out = [
+                ('file', os.path.join(settings.COMPRESS_ROOT, u'css/one.css'), u''),
+                ('hunk', u'p { border:5px solid green;}', u''),
+                ('file', os.path.join(settings.COMPRESS_ROOT, u'css/two.css'), u''),
+            ]
+            split = self.css_node.split_contents()
+            split = [(x[0], x[1], self.css_node.parser.elem_str(x[2])) for x in split]
+            self.assertEqual(out, split)
+
+        def test_js_split(self):
+            out = [('file', os.path.join(settings.COMPRESS_ROOT, u'js/one.js'), u''),
+                ('hunk', u'obj.value = "value";', u'')
+            ]
+            split = self.js_node.split_contents()
+            split = [(x[0], x[1], self.js_node.parser.elem_str(x[2])) for x in split]
+            self.assertEqual(out, split)
+
+        def setUp(self):
+            self.old_parser = settings.COMPRESS_PARSER
+            settings.COMPRESS_PARSER = 'compressor.parser.Html5LibParser'
+            super(Html5LibCompressorTestCase, self).setUp()
+
+        def tearDown(self):
+            settings.COMPRESS_PARSER = self.old_parser
class CssAbsolutizingTestCase(TestCase):
def setUp(self):
diff --git a/docs/installation.txt b/docs/installation.txt
index c563551..c2f9c72 100644
--- a/docs/installation.txt
+++ b/docs/installation.txt
@@ -51,9 +51,18 @@ for the optional :ref:`parser `
STATIC_DEPS=true pip install lxml
+html5lib_ (optional)
+^^^^^^^^^^^^^^^^^^^^
+
+for the optional :ref:`parser <compress_parser>`
+``compressor.parser.Html5LibParser``::
+
+ pip install html5lib
+
.. _BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/
.. _lxml: http://codespeak.net/lxml/
.. _libxml2: http://xmlsoft.org/
+.. _html5lib: https://github.com/html5lib/html5lib-python
Deprecation
-----------
diff --git a/docs/settings.txt b/docs/settings.txt
index 2cd15f7..7f77c77 100644
--- a/docs/settings.txt
+++ b/docs/settings.txt
@@ -171,6 +171,7 @@ The backends included in Django Compressor:
- ``compressor.parser.BeautifulSoupParser``
- ``compressor.parser.LxmlParser``
+- ``compressor.parser.Html5LibParser``
See :ref:`dependencies` for more info about the packages you need
for each parser.