Detect the encoding using chardet
Instead of always assuming utf-8 (which is typically ok), switch to using the chardet library to detect the incoming file's encoding more accurately. Change-Id: I907c8f4e1a6682138b220b49fcfcff6eef85a0b3
This commit is contained in:
parent
8ea62aff22
commit
687addf9b0
|
@ -17,6 +17,7 @@
|
|||
import errno
|
||||
import os
|
||||
|
||||
import chardet
|
||||
from docutils import frontend
|
||||
from docutils import parsers as docutils_parser
|
||||
from docutils import utils
|
||||
|
@ -25,25 +26,13 @@ import six
|
|||
|
||||
|
||||
class ParsedFile(object):
|
||||
def __init__(self, filename, encoding='utf8'):
|
||||
def __init__(self, filename, encoding=None):
|
||||
self._filename = filename
|
||||
self._content = None
|
||||
self._raw_content = None
|
||||
self._encoding = encoding
|
||||
self._doc = None
|
||||
self._errors = None
|
||||
self._defaults = {
|
||||
'input_encoding': self._encoding,
|
||||
'halt_level': 5,
|
||||
'report_level': 5,
|
||||
'quiet': True,
|
||||
'file_insertion_enabled': False,
|
||||
'traceback': True,
|
||||
# Development use only.
|
||||
'dump_settings': False,
|
||||
'dump_internals': False,
|
||||
'dump_transforms': False,
|
||||
}
|
||||
|
||||
@property
|
||||
def errors(self):
|
||||
|
@ -61,7 +50,17 @@ class ParsedFile(object):
|
|||
# mature).
|
||||
parser_cls = docutils_parser.get_parser_class("rst")
|
||||
parser = parser_cls()
|
||||
defaults = dict(self._defaults)
|
||||
defaults = {
|
||||
'halt_level': 5,
|
||||
'report_level': 5,
|
||||
'quiet': True,
|
||||
'file_insertion_enabled': False,
|
||||
'traceback': True,
|
||||
# Development use only.
|
||||
'dump_settings': False,
|
||||
'dump_internals': False,
|
||||
'dump_transforms': False,
|
||||
}
|
||||
opt = frontend.OptionParser(components=[parser], defaults=defaults)
|
||||
doc = utils.new_document(source_path=self.filename,
|
||||
settings=opt.get_default_values())
|
||||
|
@ -83,6 +82,8 @@ class ParsedFile(object):
|
|||
|
||||
@property
|
||||
def encoding(self):
|
||||
if self._encoding is None:
|
||||
self._encoding = chardet.detect(self.raw_contents)['encoding']
|
||||
return self._encoding
|
||||
|
||||
@property
|
||||
|
@ -100,7 +101,7 @@ class ParsedFile(object):
|
|||
return self._content
|
||||
|
||||
|
||||
def parse(filename, encoding="utf8"):
|
||||
def parse(filename, encoding=None):
|
||||
if not os.path.isfile(filename):
|
||||
raise IOError(errno.ENOENT, 'File not found', filename)
|
||||
return ParsedFile(filename, encoding=encoding)
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
argparse
|
||||
chardet
|
||||
docutils
|
||||
restructuredtext-lint>=0.7
|
||||
six
|
||||
|
|
Loading…
Reference in New Issue