
Added AutoSelectParser, which picks LxmlParser if lxml is available and falls back to HtmlParser if not; it is also the new default. Created a special BeautifulSoupTest so that the BeautifulSoup parser is still tested. Updated the README, installation and settings docs to reflect these changes.
78 lines
2.3 KiB
Python
78 lines
2.3 KiB
Python
from HTMLParser import HTMLParser
|
|
from django.utils.encoding import smart_unicode
|
|
from django.utils.datastructures import SortedDict
|
|
from compressor.exceptions import ParserError
|
|
from compressor.parser import ParserBase
|
|
|
|
class HtmlParser(ParserBase, HTMLParser):
    """Collect CSS and JavaScript elements using the stdlib ``HTMLParser``.

    The markup is parsed once, in ``__init__``. Every ``<style>`` and
    ``<link>`` start tag is recorded as a CSS element and every ``<script>``
    start tag as a JS element. Each element is a plain dict with the keys
    ``tag``, ``attrs`` (list of ``(name, value)`` pairs as reported by
    ``HTMLParser``), ``attrs_dict`` and ``text`` (the inline body, ``''``
    until data arrives, or ``None`` for ``<link>``).
    """

    def __init__(self, content):
        """Parse ``content`` and record the elements found in it.

        Raises:
            ParserError: if feeding or closing the underlying parser raises.
        """
        HTMLParser.__init__(self)
        self.content = content
        self._css_elems = []
        self._js_elems = []
        # Name of the <style>/<script> element whose body is currently being
        # read, or None when outside such an element.
        self._current_tag = None
        try:
            self.feed(self.content)
            self.close()
        except Exception as err:
            raise ParserError("Error while initializing HtmlParser: %s" % err)

    def handle_starttag(self, tag, attrs):
        """Record ``<style>``, ``<script>`` and ``<link>`` start tags."""
        tag = tag.lower()
        if tag == 'link':
            # <link> has no body, so it never becomes the current tag.
            self._css_elems.append({
                'tag': tag,
                'attrs': attrs,
                'attrs_dict': dict(attrs),
                'text': None,
            })
            return

        if tag == 'style':
            bucket = self._css_elems
        elif tag == 'script':
            bucket = self._js_elems
        else:
            return

        bucket.append({
            'tag': tag,
            'attrs': attrs,
            'attrs_dict': dict(attrs),
            'text': '',
        })
        self._current_tag = tag

    def handle_endtag(self, tag):
        """Stop collecting body text once the current element is closed."""
        if self._current_tag and self._current_tag == tag.lower():
            self._current_tag = None

    def handle_data(self, data):
        """Append ``data`` to the body of the element currently being read.

        ``HTMLParser`` may deliver the body of a single element in several
        chunks, so the text is accumulated rather than overwritten.
        """
        if self._current_tag == 'style':
            self._css_elems[-1]['text'] += data
        elif self._current_tag == 'script':
            self._js_elems[-1]['text'] += data

    def css_elems(self):
        """Return the list of recorded ``<style>`` and ``<link>`` elements."""
        return self._css_elems

    def js_elems(self):
        """Return the list of recorded ``<script>`` elements."""
        return self._js_elems

    def elem_name(self, elem):
        """Return the tag name stored for ``elem``."""
        return elem['tag']

    def elem_attribs(self, elem):
        """Return the attributes of ``elem`` as a dict."""
        return elem['attrs_dict']

    def elem_content(self, elem):
        """Return the stored body text of ``elem`` passed through ``smart_unicode``."""
        return smart_unicode(elem['text'])

    def elem_str(self, elem):
        """Render ``elem`` back into a markup string.

        ``<link>`` is rendered as a self-closing tag; any other element is
        rendered as an open tag, its text and a matching close tag.
        Attributes without a value (reported by ``HTMLParser`` as ``None``)
        are rendered as the bare attribute name.
        """
        rendered = []
        for name, value in elem['attrs']:
            if value is None:
                rendered.append(name)
            else:
                rendered.append('%s="%s"' % (name, value))

        tag = dict(elem)
        if rendered:
            tag['attrs'] = ' %s' % ' '.join(rendered)
        else:
            tag['attrs'] = ''

        if elem['tag'] == 'link':
            return '<%(tag)s%(attrs)s />' % tag
        return '<%(tag)s%(attrs)s>%(text)s</%(tag)s>' % tag
|