Allow overriding file encoding

Chardet doesn't always seem to detect a file's encoding correctly in all circumstances. So that a user can specify the exact encoding of their files, add a new config option and a new CLI option that manually override the encoding that chardet would otherwise try to determine. If the override is provided, chardet detection will no longer run. Fixes bug 1384463 Change-Id: Ie8baf3f79083e1495c7420a9d0569390cad2115e
2014-10-22 16:13:33 -07:00 · 2014-10-22 16:13:33 -07:00 · 04a710c687
parent 8b8f22329b
commit 04a710c687
3 changed files with 35 additions and 1 deletions
--- a/README.rst
+++ b/README.rst
@ -59,6 +59,10 @@ Command line usage
      --default-extension extension
                            Default file extension to use when a file is found
                            without a file extension.
+      --file-encoding encoding
+                            Override encoding to use when attempting to determine
+                            an input files text encoding (providing this avoids
+                            using `chardet` to automatically detect encoding/s)
      --max-line-length int
                            maximum allowed line length (default: 79)
      -e extension, --extension extension
@ -110,6 +114,7 @@ Option                 Overrides    Merges
 ``ignore-path``        No           Yes
 ``ignore``             No           Yes
 ``max-line-length``    Yes          No
+``file-encoding``      Yes          No
 ``sphinx``             Yes          No
 =====================  ===========  ========

--- a/doc8/main.py
+++ b/doc8/main.py
@ -109,6 +109,10 @@ def extract_config(args):
        cfg['verbose'] = parser.getboolean("doc8", "verbose")
    except (configparser.NoSectionError, configparser.NoOptionError):
        pass
+    try:
+        cfg['file_encoding'] = parser.get("doc8", "file-encoding")
+    except (configparser.NoSectionError, configparser.NoOptionError):
+        pass
    try:
        cfg['default_extension'] = parser.get("doc8", "default-extension")
    except (configparser.NoSectionError, configparser.NoOptionError):
@ -160,6 +164,7 @@ def scan(cfg):
    file_iter = utils.find_files(cfg.get('paths', []),
                                 cfg.get('extension', []), ignored_paths)
    default_extension = cfg.get('default_extension')
+    file_encoding = cfg.get('file_encoding')
    for filename, ignoreable in file_iter:
        if ignoreable:
            files_ignored += 1
@ -167,7 +172,8 @@ def scan(cfg):
                print("  Ignoring '%s'" % (filename))
        else:
            f = file_parser.parse(filename,
-                                  default_extension=default_extension)
+                                  default_extension=default_extension,
+                                  encoding=file_encoding)
            files.append(f)
            if cfg.get('verbose'):
                print("  Selecting '%s'" % (filename))
@ -275,6 +281,13 @@ def main():
                             " found without a file extension.",
                        default='', dest='default_extension',
                        metavar='extension')
+    parser.add_argument("--file-encoding", action="store",
+                        help="Override encoding to use when attempting"
+                             " to determine an input files text encoding "
+                             "(providing this avoids using `chardet` to"
+                             " automatically detect encoding/s)",
+                        default='', dest='file_encoding',
+                        metavar='encoding')
    parser.add_argument("--max-line-length", action="store", metavar="int",
                        type=int,
                        help="Maximum allowed line"
--- a/doc8/tests/test_checks.py
+++ b/doc8/tests/test_checks.py
@ -89,6 +89,22 @@ test
                (line, code, msg) = errors[0]
                self.assertIn(code, check.REPORTS)

+    def test_correct_length(self):
+        """Check that an explicitly utf-8 decoded line passes the length check.
+
+        The written line is 80 bytes long but only 78 characters once the
+        three-byte en dash (``\\xe2\\x80\\x93``) is decoded, so it only fits
+        the 79 limit when measured in characters.
+        """
+        conf = {
+            'max_line_length': 79,
+            'allow_long_titles': True,
+        }
+        with tempfile.NamedTemporaryFile(suffix='.rst') as fh:
+            # Every fragment carries the ``b`` prefix: Python 3 refuses to
+            # concatenate bytes and non-bytes literals (SyntaxError).
+            fh.write(b'known exploit in the wild, for example'
+                     b' \xe2\x80\x93 the time'
+                     b' between advance notification')
+            # Flush so the parser sees the content when it reopens by name.
+            fh.flush()

+            parsed_file = parser.ParsedFile(fh.name, encoding='utf-8')
+            check = checks.CheckMaxLineLength(conf)
+            errors = list(check.report_iter(parsed_file))
+            self.assertEqual(0, len(errors))
+
    def test_unsplittable_length(self):
        content = """
 ===