from HTMLParser import HTMLParser

from django.utils.encoding import smart_unicode
from django.utils.datastructures import SortedDict

from compressor.exceptions import ParserError
from compressor.parser import ParserBase


class HtmlParser(ParserBase, HTMLParser):
    """Collect ``<link>``, ``<style>`` and ``<script>`` elements from markup.

    The content is parsed eagerly in ``__init__`` with the standard library
    ``HTMLParser``. Every element of interest is recorded as a dict with the
    keys ``'tag'`` (lower-cased tag name), ``'attrs'`` (list of
    ``(name, value)`` pairs as reported by ``HTMLParser``), ``'attrs_dict'``
    (the same pairs as a dict) and ``'text'`` (inline content; ``None`` for
    ``<link>`` elements).
    """

    def __init__(self, content):
        """Parse *content* and collect its CSS and JS elements.

        Raises:
            ParserError: if feeding or closing the underlying parser fails
                for any reason.
        """
        HTMLParser.__init__(self)
        self.content = content
        self._css_elems = []
        self._js_elems = []
        # Name of the <style>/<script> element whose text is currently being
        # collected, or None when outside of such an element.
        self._current_tag = None
        try:
            self.feed(self.content)
            self.close()
        except Exception as err:
            raise ParserError(
                "Error while initializing HtmlParser: %s" % err)

    def handle_starttag(self, tag, attrs):
        """Record ``<link>``, ``<style>`` and ``<script>`` start tags."""
        tag = tag.lower()
        if tag == 'link':
            # <link> carries no inline content, so its text stays None and
            # it never becomes the "current" tag.
            self._css_elems.append({
                'tag': tag,
                'attrs': attrs,
                'attrs_dict': dict(attrs),
                'text': None,
            })
            return

        if tag == 'style':
            elems = self._css_elems
        elif tag == 'script':
            elems = self._js_elems
        else:
            return

        elems.append({
            'tag': tag,
            'attrs': attrs,
            'attrs_dict': dict(attrs),
            'text': '',
        })
        self._current_tag = tag

    def handle_endtag(self, tag):
        """Stop collecting text once the current element is closed."""
        if self._current_tag and self._current_tag == tag.lower():
            self._current_tag = None

    def handle_data(self, data):
        """Append *data* to the text of the currently open element.

        HTMLParser may report the content of a single element through
        several ``handle_data`` calls, so the chunks are accumulated instead
        of letting the last chunk overwrite the previous ones.
        """
        if self._current_tag == 'style':
            self._css_elems[-1]['text'] += data
        elif self._current_tag == 'script':
            self._js_elems[-1]['text'] += data

    def css_elems(self):
        """Return the collected ``<link>`` and ``<style>`` elements."""
        return self._css_elems

    def js_elems(self):
        """Return the collected ``<script>`` elements."""
        return self._js_elems

    def elem_name(self, elem):
        """Return the lower-cased tag name of *elem*."""
        return elem['tag']

    def elem_attribs(self, elem):
        """Return the attributes of *elem* as a dict."""
        return elem['attrs_dict']

    def elem_content(self, elem):
        """Return the text stored for *elem*, passed through smart_unicode."""
        return smart_unicode(elem['text'])

    def elem_str(self, elem):
        """Serialize *elem* back to markup.

        ``<link>`` elements are rendered self-closing; every other element is
        rendered with its text and a matching closing tag.
        """
        rendered = []
        for name, value in elem['attrs']:
            if value is None:
                # Attribute given without a value (e.g. ``async``): emit the
                # bare name rather than the string "None".
                rendered.append(name)
            else:
                rendered.append('%s="%s"' % (name, value))

        attrs = ''
        if rendered:
            attrs = ' %s' % ' '.join(rendered)

        if elem['tag'] == 'link':
            return '<%s%s />' % (elem['tag'], attrs)
        return '<%s%s>%s</%s>' % (
            elem['tag'], attrs, elem['text'], elem['tag'])