From 51cbe4f9144f45b4cdcd5a8b348de6e867b56e28 Mon Sep 17 00:00:00 2001
From: Joshua Harlow <harlowja@gmail.com>
Date: Sun, 18 May 2014 18:12:03 -0700
Subject: [PATCH] Move over the rest of the checks

Get the rest of the checks working (the
max line length check now works).

Remove the old script now that it is not
needed anymore.
---
 doc8/checks.py | 143 +++++++++++++++++++-
 doc8/main.py   |  45 ++++++-
 doc8/utils.py  |  17 ++-
 scripts/doc8   | 360 -------------------------------------------------
 4 files changed, 201 insertions(+), 364 deletions(-)
 delete mode 100755 scripts/doc8

diff --git a/doc8/checks.py b/doc8/checks.py
index 9c71c7d..224a181 100644
--- a/doc8/checks.py
+++ b/doc8/checks.py
@@ -17,10 +17,14 @@
 # under the License.
 
 import abc
+import collections
 import re
 
+from docutils import nodes as docutils_nodes
 import six
 
+from doc8 import utils
+
 
 @six.add_metaclass(abc.ABCMeta)
 class ContentCheck(object):
@@ -74,5 +78,142 @@ class CheckCarriageReturn(LineCheck):
 class CheckMaxLineLength(ContentCheck):
     REPORTS = frozenset(["D001"])
 
+    def _extract_node_lines(self, doc):
+        """Return ([(node, (min_line, max_line)), ...], first_line)."""
+        def find_line(node):
+            current = node
+            while current is not None:
+                if current.line is not None:
+                    return current.line
+                current = current.parent
+            return None
+
+        def extract_lines(node, start_line):
+            found = [start_line]
+            if isinstance(node, docutils_nodes.title):
+                # Step back by the number of raw source lines of the title.
+                start = start_line - len(node.rawsource.splitlines())
+                if start >= 0:
+                    found.append(start)
+            if isinstance(node, docutils_nodes.literal_block):
+                # Step forward to the last raw source line of the block.
+                end = start_line + len(node.rawsource.splitlines()) - 1
+                found.append(end)
+            return found
+
+        def gather_lines(node):
+            gathered = []
+            for child in node.traverse(include_self=True):
+                gathered.extend(extract_lines(child, find_line(child)))
+            return gathered
+
+        def filter_systems(node):
+            return not utils.has_any_node_type(
+                node, (docutils_nodes.system_message,))
+
+        nodes_lines = []
+        first_line = -1
+        for n in utils.filtered_traverse(doc, filter_systems):
+            line = find_line(n)
+            if line is None:
+                continue
+            if first_line == -1:
+                first_line = line
+            spanned = set(gather_lines(n))
+            nodes_lines.append((n, (min(spanned), max(spanned))))
+        return (nodes_lines, first_line)
+
+    def _extract_directives(self, lines):
+        """Return (start, end) line index pairs of directive regions."""
+        def starting_whitespace(line):
+            m = re.match(r"^(\s+)(.*)$", line)
+            if not m:
+                return 0
+            return len(m.group(1))
+
+        def all_whitespace(line):
+            return bool(re.match(r"^(\s*)$", line))
+
+        def find_directive_end(start, lines):
+            after_lines = collections.deque(lines[start + 1:])
+            k = 0
+            while after_lines:
+                line = after_lines.popleft()
+                if all_whitespace(line) or starting_whitespace(line) >= 1:
+                    k += 1
+                else:
+                    break
+            return start + k
+
+        # Find where directives start & end so that we can exclude content in
+        # these directive regions (the rst parser may not handle this correctly
+        # for unknown directives, so we have to do it manually).
+        directives = []
+        for i, line in enumerate(lines):
+            # Escaped dots: only a literal ".." marker, optionally indented.
+            if (re.match(r"^\s*\.\.\s(.*?)::\s*", line)
+                    or re.match(r"^::\s*$", line)):
+                directives.append((i, find_directive_end(i, lines)))
+        return directives
+
     def report_iter(self, parsed_file):
-        pass
+        """Report lines longer than the configured maximum.
+
+        Yields a (line number, 'D001', 'Line too long') tuple for each
+        overlong line that none of the exemptions below apply to.
+        """
+        document = parsed_file.document
+        lines = list(parsed_file.lines_iter())
+        nodes_lines, first_line = self._extract_node_lines(document)
+        directives = self._extract_directives(lines)
+
+        def find_containing_nodes(num):
+            # Lines before the first recorded node go to that node.
+            if num < first_line and nodes_lines:
+                return [nodes_lines[0][0]]
+            # Otherwise keep the node(s) with the tightest span around num.
+            smallest_span = None
+            best_nodes = []
+            for (n, (line_min, line_max)) in nodes_lines:
+                if not (line_min <= num <= line_max):
+                    continue
+                span = line_max - line_min
+                if smallest_span is None or span < smallest_span:
+                    smallest_span = span
+                    best_nodes = [n]
+                elif span == smallest_span:
+                    best_nodes.append(n)
+            return best_nodes
+
+        def any_types(nodes, types):
+            return any(isinstance(n, types) for n in nodes)
+
+        skip_types = (
+            docutils_nodes.target,
+            docutils_nodes.literal_block,
+        )
+        title_types = (
+            docutils_nodes.title,
+        )
+        max_line_length = self._cfg['max_line_length']
+        allow_long_titles = self._cfg['allow_long_titles']
+        for i, line in enumerate(lines):
+            if len(line) <= max_line_length:
+                continue
+            # Directive spans hold 0-based line indexes (from enumerate), so
+            # they are compared against i directly.
+            if any(start <= i <= end for (start, end) in directives):
+                continue
+            stripped = line.lstrip()
+            if ' ' not in stripped:
+                # No room to split even if we could.
+                continue
+            if utils.contains_url(stripped):
+                continue
+            # Nodes are looked up by i + 1, the same number that is reported.
+            nodes = find_containing_nodes(i + 1)
+            if any_types(nodes, skip_types):
+                continue
+            if allow_long_titles and any_types(nodes, title_types):
+                continue
+            yield (i + 1, 'D001', 'Line too long')
diff --git a/doc8/main.py b/doc8/main.py
index 54144c0..cff4ae8 100644
--- a/doc8/main.py
+++ b/doc8/main.py
@@ -35,6 +35,7 @@ import os
 
 from six.moves import configparser
 
+from doc8 import checks
 from doc8 import parser as file_parser
 from doc8 import utils
 
@@ -88,6 +89,15 @@ def extract_config(args):
     return cfg
 
 
+def fetch_checks(cfg):
+    """Build one instance of every known check, each given cfg."""
+    check_types = (checks.CheckTrailingWhitespace,
+                   checks.CheckIndentationNoTab,
+                   checks.CheckCarriageReturn,
+                   checks.CheckMaxLineLength)
+    return [check_type(cfg) for check_type in check_types]
+
+
 def main():
     parser = argparse.ArgumentParser(
         description=__doc__,
@@ -121,5 +131,36 @@ def main():
     args.update(cfg)
 
     files = []
-    for filename in utils.find_files(args['paths'], FILE_PATTERNS):
-        files.append(file_parser.parse(filename))
\ No newline at end of file
+    for filename in utils.find_files(args.pop('paths', []), FILE_PATTERNS):
+        files.append(file_parser.parse(filename))
+
+    ignoreables = frozenset(args.pop('ignore', []))
+    errors = 0
+    for f in files:
+        for c in fetch_checks(args):
+            try:
+                reports = set(c.REPORTS)
+            except AttributeError:
+                pass
+            else:
+                reports = reports - ignoreables
+                if not reports:
+                    continue
+            if isinstance(c, checks.ContentCheck):
+                for line_num, code, message in c.report_iter(f):
+                    print('%s:%s: %s %s'
+                          % (f.filename, line_num, code, message))
+                    errors += 1
+            elif isinstance(c, checks.LineCheck):
+                for line_num, line in enumerate(f.lines_iter(), 1):
+                    for code, message in c.report_iter(line):
+                        print('%s:%s: %s %s'
+                              % (f.filename, line_num, code, message))
+                        errors += 1
+            else:
+                raise TypeError("Unknown check type: %s, %s"
+                                % (type(c), c))
+    if errors:
+        return 1
+    else:
+        return 0
\ No newline at end of file
diff --git a/doc8/utils.py b/doc8/utils.py
index a05a75d..74d2af7 100644
--- a/doc8/utils.py
+++ b/doc8/utils.py
@@ -34,7 +34,22 @@ def find_files(paths, patterns):
             raise IOError('Invalid path: %s' % path)
 
 
-def filter_document(document, filter_func):
+def filtered_traverse(document, filter_func):
     for n in document.traverse(include_self=True):
         if filter_func(n):
             yield n
+
+
+def contains_url(line):
+    """Return True when line contains an http:// or https:// prefix."""
+    # Plain substring test; the url itself is not validated in any way.
+    return "http://" in line or "https://" in line
+
+
+def has_any_node_type(node, node_types):
+    """Return True if node, or any ancestor of it, is one of node_types."""
+    current = node
+    while current is not None and not isinstance(current, node_types):
+        # Not a match; climb to the parent (the walk stops at None).
+        current = current.parent
+    return current is not None
diff --git a/scripts/doc8 b/scripts/doc8
deleted file mode 100755
index f31fac2..0000000
--- a/scripts/doc8
+++ /dev/null
@@ -1,360 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-# Copyright (C) 2014 Ivan Melnikov <iv at altlinux dot org>
-#
-# Author: Joshua Harlow <harlowja@yahoo-inc.com>
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
-
-"""Check documentation for simple style requirements.
-
-What is checked:
-    - lines should not be longer than 79 characters - D001
-      - exception: line with no whitespace except in the beginning
-      - exception: lines with http or https urls
-      - exception: literal blocks
-      - exception: rst target directives
-    - no trailing whitespace - D002
-    - no tabulation for indentation - D003
-    - no carriage returns (use unix newlines) - D004
-"""
-
-import argparse
-import collections
-import fnmatch
-import os
-import re
-import sys
-
-from docutils import frontend
-from docutils import nodes as doc_nodes
-from docutils.parsers import rst
-from docutils import utils
-
-import six
-from six.moves import configparser
-
-
-FILE_PATTERNS = ['*.rst', '*.txt']
-MAX_LINE_LENGTH = 79
-TRAILING_WHITESPACE_REGEX = re.compile('\s$')
-STARTING_WHITESPACE_REGEX = re.compile('^(\s+)')
-CONFIG_FILENAMES = [
-    "doc8.ini",
-    "tox.ini",
-    "pep8.ini",
-    "setup.cfg",
-]
-
-
-def check_max_length(fn, cfg, contents):
-
-    def contains_url(line):
-        if "http://" in line or "https://" in line:
-            return True
-        return False
-
-    def any_node_type(node, node_types):
-        n = node
-        node_types = tuple(node_types)
-        while n is not None:
-            if isinstance(n, node_types):
-                return True
-            n = n.parent
-        return False
-
-    def extract_lines(node, start_line):
-        lines = [start_line]
-        if isinstance(node, (doc_nodes.title)):
-            start = start_line - len(node.rawsource.splitlines())
-            if start >= 0:
-                lines.append(start)
-        if isinstance(node, (doc_nodes.literal_block)):
-            end = start_line + len(node.rawsource.splitlines()) - 1
-            lines.append(end)
-        return lines
-
-    def gather_lines(node):
-        lines = []
-        for n in node.traverse(include_self=True):
-            lines.extend(extract_lines(n, find_line(n)))
-        return lines
-
-    def find_line(node):
-        n = node
-        while n is not None:
-            if n.line is not None:
-                return n.line
-            n = n.parent
-        return None
-
-    def find_containing_nodes(num, node_lines, first_line):
-        if num < first_line and len(node_lines):
-            return [node_lines[0][0]]
-        contained_in = []
-        for (n, (line_min, line_max)) in node_lines:
-            if num >= line_min and num <= line_max:
-                contained_in.append((n, (line_min, line_max)))
-        smallest_span = None
-        best_nodes = []
-        for (n, (line_min, line_max)) in contained_in:
-            span = line_max - line_min
-            if smallest_span is None:
-                smallest_span = span
-                best_nodes = [n]
-            elif span < smallest_span:
-                smallest_span = span
-                best_nodes = [n]
-            elif span == smallest_span:
-                best_nodes.append(n)
-        return best_nodes
-
-    def find_directive_end(start, lines):
-
-        def starting_whitespace(line):
-            m = re.match(r"^(\s+)(.*)$", line)
-            if not m:
-                return 0
-            return len(m.group(1))
-
-        def all_whitespace(line):
-            return bool(re.match(r"^(\s*)$", line))
-
-        after_lines = collections.deque(lines[start + 1:])
-        k = 0
-        while after_lines:
-            line = after_lines.popleft()
-            if all_whitespace(line) or starting_whitespace(line) >= 1:
-                k += 1
-            else:
-                break
-        return start + k
-
-    # Use the rst parsers document output to do as much of the validation
-    # as we can without resorting to custom logic (this parser is what sphinx
-    # and others use anyway so it's very mature).
-    parser = rst.Parser()
-    defaults = {
-        'input_encoding': 'utf8',
-        'halt_level': 5,
-        'report_level': 5,
-        'quiet': True,
-        'file_insertion_enabled': False,
-        'traceback': True,
-    }
-    opt = frontend.OptionParser(components=[parser], defaults=defaults)
-    doc = utils.new_document(source_path=fn, settings=opt.get_default_values())
-    parser.parse(contents, doc)
-    node_lines = []
-    first_line = -1
-    for n in doc.traverse(include_self=True):
-        line = find_line(n)
-        if line is None:
-            continue
-        if any_node_type(n, [doc_nodes.system_message]):
-            # These are failures, and there node content isn't correct,
-            # so skip them; we should work on making it so that the parser
-            # stops doing this custom parent creation in the first place.
-            continue
-        if first_line == -1:
-            first_line = line
-        contained_lines = set(gather_lines(n))
-        node_lines.append((n, (min(contained_lines), max(contained_lines))))
-
-    # Find where directives start & end so that we can exclude content in
-    # these directive regions (the rst parser may not handle this correctly
-    # for unknown directives, so we have to do it manually).
-    lines = contents.split("\n")
-    directives = []
-    for i, line in enumerate(lines):
-        if re.match(r"^..\s(.*?)::\s*", line):
-            directives.append((i, find_directive_end(i, lines)))
-        elif re.match(r"^::\s*$", line):
-            directives.append((i, find_directive_end(i, lines)))
-
-    skip_types = (
-        doc_nodes.target,
-        doc_nodes.literal_block,
-    )
-    title_types = (
-        doc_nodes.title,
-    )
-    max_line_length = cfg['max_line_length']
-    allow_long = cfg['allow_long_titles']
-    for i, line in enumerate(lines):
-        if len(line) > max_line_length:
-            in_directive = False
-            for (start, end) in directives:
-                if i >= start and i <= end:
-                    in_directive = True
-                    break
-            if in_directive:
-                continue
-            stripped = line.strip()
-            if ' ' not in stripped:
-                continue
-            if contains_url(stripped):
-                continue
-            nodes = find_containing_nodes(i + 1, node_lines, first_line)
-            if any([isinstance(n, skip_types) for n in nodes]):
-                continue
-            if allow_long and any([isinstance(n, title_types) for n in nodes]):
-                continue
-            yield (i + 1, 'D001', 'Line too long')
-
-
-def check_trailing_whitespace(fn, cfg, line):
-    if TRAILING_WHITESPACE_REGEX.search(line):
-        yield ('D002', 'Trailing whitespace')
-
-
-def check_indentation_no_tab(fn, cfg, line):
-    match = STARTING_WHITESPACE_REGEX.search(line)
-    if match:
-        spaces = match.group(1)
-        if '\t' in spaces:
-            yield ('D003', 'Tabulation used for indentation')
-
-
-def check_carriage_return(fn, cfg, line):
-    if "\r" in line:
-        yield ('D004', 'Found literal carriage return')
-
-
-def check_lines(fn, cfg, lines, line_checks):
-    for idx, line in enumerate(lines, 1):
-        line = six.text_type(line, encoding='utf8')
-        line = line.rstrip('\n')
-        for check in line_checks:
-            for code, message in check(fn, cfg, line):
-                yield idx, code, message
-
-
-def check_files(cfg, filenames, line_checks, content_checks):
-    for fn in filenames:
-        with open(fn, 'rb') as fh:
-            content = six.text_type(fh.read(), encoding='utf8')
-            for content_check in content_checks:
-                for line_num, code, message in content_check(fn, cfg, content):
-                    yield fn, line_num, code, message
-            fh.seek(0)
-            for line_num, code, message in check_lines(fn, cfg,
-                                                       fh, line_checks):
-                yield fn, line_num, code, message
-
-
-def find_files(pathes, patterns):
-    for path in pathes:
-        if os.path.isfile(path):
-            yield path
-        elif os.path.isdir(path):
-            for root, dirnames, filenames in os.walk(path):
-                for filename in filenames:
-                    if any(fnmatch.fnmatch(filename, pattern)
-                           for pattern in patterns):
-                        yield os.path.join(root, filename)
-        else:
-            print('Invalid path: %s' % path)
-
-
-def split_string(text):
-    return [i.strip() for i in text.split(",") if i.strip()]
-
-
-def extract_config(args, default_cfg):
-    if args.config:
-        parser = configparser.RawConfigParser()
-        for fn in list(args.config):
-            with open(fn, 'r') as fh:
-                parser.readfp(fh, filename=fn)
-    else:
-        parser = configparser.RawConfigParser()
-        parser.read(CONFIG_FILENAMES)
-    cfg = dict(default_cfg)
-    try:
-        cfg['max_line_length'] = parser.getint("doc8", "max-line-length")
-    except (configparser.NoSectionError, configparser.NoOptionError):
-        pass
-    try:
-        ignores = parser.get("doc8", "ignore")
-    except (configparser.NoSectionError, configparser.NoOptionError):
-        pass
-    else:
-        cfg['ignore'].update(split_string(ignores))
-    try:
-        cfg['allow_long_titles'] = parser.getboolean("doc8",
-                                                     "allow-long-titles")
-    except (configparser.NoSectionError, configparser.NoOptionError):
-        pass
-    return cfg
-
-
-def unique_itr(itr):
-    seen = set()
-    for i in itr:
-        if i in seen:
-            continue
-        yield i
-        seen.add(i)
-
-
-def main():
-    file_types = ", ".join(FILE_PATTERNS)
-    default_configs = ", ".join(CONFIG_FILENAMES)
-    parser = argparse.ArgumentParser(
-        description=__doc__,
-        formatter_class=argparse.RawDescriptionHelpFormatter)
-    parser.add_argument("paths", metavar='path', type=str, nargs='*',
-                        help=("path to scan for %s files"
-                              " (default: os.getcwd())") % file_types,
-                        default=[os.getcwd()])
-    parser.add_argument("--config", metavar='path', action="append",
-                        help="user config file location"
-                             " (default: %s)" % default_configs)
-    parser.add_argument("--allow-long-titles", action="store_true",
-                        help="allow long section titles (default: False)",
-                        default=False)
-    parser.add_argument("--ignore", action="append", metavar="code",
-                        help="ignore the given errors code/codes",
-                        default=[])
-    args = parser.parse_args()
-    default_cfg = {
-        'max_line_length': MAX_LINE_LENGTH,
-        'ignore': set(),
-        'allow_long_titles': args.allow_long_titles,
-    }
-    for c in args.ignore:
-        default_cfg['ignore'].update(split_string(c))
-    cfg = extract_config(args, default_cfg)
-    line_checks = [
-        check_trailing_whitespace,
-        check_indentation_no_tab,
-        check_carriage_return,
-    ]
-    content_checks = [
-        check_max_length,
-    ]
-    ok = True
-    paths = unique_itr(args.paths)
-    for error in check_files(cfg, find_files(paths, FILE_PATTERNS),
-                             line_checks, content_checks):
-        if error[2] in cfg['ignore']:
-            continue
-        ok = False
-        print('%s:%s: %s %s' % error)
-    sys.exit(0 if ok else 1)
-
-if __name__ == '__main__':
-    main()