From 687addf9b0b690992aa674af4c5b17bda79e4ee7 Mon Sep 17 00:00:00 2001 From: Joshua Harlow Date: Sat, 16 Aug 2014 23:42:42 -0700 Subject: [PATCH] Detect the encoding using chardet Instead of always assuming utf-8 (which is typically ok), switch to using the chardet library and use it to detect the incoming file's encoding in a more accurate manner. Change-Id: I907c8f4e1a6682138b220b49fcfcff6eef85a0b3 --- doc8/parser.py | 31 ++++++++++++++++--------------- requirements.txt | 1 + 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/doc8/parser.py b/doc8/parser.py index 30bb8fa..634bb66 100644 --- a/doc8/parser.py +++ b/doc8/parser.py @@ -17,6 +17,7 @@ import errno import os +import chardet from docutils import frontend from docutils import parsers as docutils_parser from docutils import utils @@ -25,25 +26,13 @@ import six class ParsedFile(object): - def __init__(self, filename, encoding='utf8'): + def __init__(self, filename, encoding=None): self._filename = filename self._content = None self._raw_content = None self._encoding = encoding self._doc = None self._errors = None - self._defaults = { - 'input_encoding': self._encoding, - 'halt_level': 5, - 'report_level': 5, - 'quiet': True, - 'file_insertion_enabled': False, - 'traceback': True, - # Development use only. - 'dump_settings': False, - 'dump_internals': False, - 'dump_transforms': False, - } @property def errors(self): @@ -61,7 +50,17 @@ class ParsedFile(object): # mature). parser_cls = docutils_parser.get_parser_class("rst") parser = parser_cls() - defaults = dict(self._defaults) + defaults = { + 'halt_level': 5, + 'report_level': 5, + 'quiet': True, + 'file_insertion_enabled': False, + 'traceback': True, + # Development use only. 
+ 'dump_settings': False, + 'dump_internals': False, + 'dump_transforms': False, + } opt = frontend.OptionParser(components=[parser], defaults=defaults) doc = utils.new_document(source_path=self.filename, settings=opt.get_default_values()) @@ -83,6 +82,8 @@ class ParsedFile(object): @property def encoding(self): + if self._encoding is None: + self._encoding = chardet.detect(self.raw_contents)['encoding'] return self._encoding @property @@ -100,7 +101,7 @@ class ParsedFile(object): return self._content -def parse(filename, encoding="utf8"): +def parse(filename, encoding=None): if not os.path.isfile(filename): raise IOError(errno.ENOENT, 'File not found', filename) return ParsedFile(filename, encoding=encoding) diff --git a/requirements.txt b/requirements.txt index 1d7be4d..f7ea722 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ argparse +chardet docutils restructuredtext-lint>=0.7 six