Files
deb-python-django-compressor/compressor/parser.py
2011-03-30 12:10:39 +02:00

120 lines
3.0 KiB
Python

from django.utils.encoding import smart_unicode
from compressor.exceptions import ParserError
class ParserBase(object):
"""
Base parser to be subclassed when creating an own parser.
"""
def __init__(self, content):
self.content = content
def css_elems(self):
"""
Return an iterable containing the css elements to handle
"""
raise NotImplementedError
def js_elems(self):
"""
Return an iterable containing the js elements to handle
"""
raise NotImplementedError
def elem_attribs(self, elem):
"""
Return the dictionary like attribute store of the given element
"""
raise NotImplementedError
def elem_content(self, elem):
"""
Return the content of the given element
"""
raise NotImplementedError
def elem_name(self, elem):
"""
Return the name of the given element
"""
raise NotImplementedError
def elem_str(self, elem):
"""
Return the string representation of the given elem
"""
raise NotImplementedError
class BeautifulSoupParser(ParserBase):
_soup = None
@property
def soup(self):
try:
from BeautifulSoup import BeautifulSoup
except ImportError, e:
raise ParserError("Error while initializing Parser: %s" % e)
if self._soup is None:
self._soup = BeautifulSoup(self.content)
return self._soup
def css_elems(self):
return self.soup.findAll({'link': True, 'style': True})
def js_elems(self):
return self.soup.findAll('script')
def elem_attribs(self, elem):
return dict(elem.attrs)
def elem_content(self, elem):
return elem.string
def elem_name(self, elem):
return elem.name
def elem_str(self, elem):
return smart_unicode(elem)
class LxmlParser(ParserBase):
_tree = None
@property
def tree(self):
try:
from lxml import html
from lxml.etree import tostring
except ImportError, e:
raise ParserError("Error while initializing Parser: %s" % e)
if self._tree is None:
content = '<root>%s</root>' % self.content
self._tree = html.fromstring(content)
try:
ignore = tostring(self._tree, encoding=unicode)
except UnicodeDecodeError:
self._tree = html.soupparser.fromstring(content)
return self._tree
def css_elems(self):
return self.tree.xpath('link[@rel="stylesheet"]|style')
def js_elems(self):
return self.tree.findall('script')
def elem_attribs(self, elem):
return elem.attrib
def elem_content(self, elem):
return smart_unicode(elem.text)
def elem_name(self, elem):
return elem.tag
def elem_str(self, elem):
from lxml import etree
return smart_unicode(
etree.tostring(elem, method='html', encoding=unicode))