diff --git a/cleanup/retf/README.md b/cleanup/retf/README.md new file mode 100644 index 00000000..ae8e03dc --- /dev/null +++ b/cleanup/retf/README.md @@ -0,0 +1,69 @@ +# retf.py + +This script applies a set of regular expressions onto a set of files +to automatically identify and fix typographical errors. + +## What does RETF mean? + +RETF means RegExTypoFix or Regular Expression Typographical error Fixer +and is a set of regular expressions to find and fix common misspellings +and grammatical errors. + +The regular expressions are available at +https://en.wikipedia.org/wiki/Wikipedia:AutoWikiBrowser/Typos. + +## Usage + +There are two ways to define the set of files. First you can simply add +single files using the parameter ```--file```. + +```$ ./retf.py --file path/to/file1 path/to/file2 path/to/file3``` + +Also you can specify paths using the parameter ```--path``` that should be +scanned for files. + +```$ ./retf.py --path path/with/files/1 path/with/files/2``` + +To not use all files inside the specified paths it's possible to filter +by the file extension. + +```$ ./retf.py --path path/with/files --extension xml txt rst``` + +It's possible to use the parameters ```--path``` and ```--file``` together. + +By default the script will only check for findings in all specified files. + +To automatically write back resolved findings add the parameter +```--write-changes```. Findings will then be written to a copy with +the ending ```.retf```. + +To fix findings directly in the files add the parameter +```--in-place``` in addition to ```--write-changes```. Findings will then be fixed directly in the files. A backup file +with the ending ```.orig``` will be created. To disable backups add the +parameter ```--no-backup```. + +To only check if there are findings inside the defined set of files, omit the parameter ```--write-changes```. + +To download the latest RETF rules from Wikipedia use the parameter ```--download```. 
+ +## Needed Python modules + +* beautifulsoup4 / bs4 (https://pypi.python.org/pypi/beautifulsoup4) +* glob2 (https://pypi.python.org/pypi/glob2) +* pyyaml (https://pypi.python.org/pypi/pyyaml) +* regex (https://pypi.python.org/pypi/regex) +* six (https://pypi.python.org/pypi/six) + +To install the needed modules you can use pip or the package management system included +in your distribution. When using the package management system the names of the +packages may differ. When using pip it may be necessary to install some development packages. +For example on Ubuntu 14.04 LTS you have to install ```libyaml-dev``` for ```pyyaml``` +and ```python-dev``` for ```regex```. + +``` +$ pip install beautifulsoup4 +$ pip install glob2 +$ pip install pyyaml +$ pip install regex +$ pip install six +``` diff --git a/cleanup/retf/disabled_rules.yaml b/cleanup/retf/disabled_rules.yaml new file mode 100644 index 00000000..ed97d539 --- /dev/null +++ b/cleanup/retf/disabled_rules.yaml @@ -0,0 +1 @@ +--- diff --git a/cleanup/retf/retf.py b/cleanup/retf/retf.py new file mode 100755 index 00000000..aecae03e --- /dev/null +++ b/cleanup/retf/retf.py @@ -0,0 +1,302 @@ +#!/usr/bin/python + +"""This script applies a set of regular expressions onto a set of files +to automatically identify and fix typographical errors. +""" + +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +# author: Christian Berendt + +# Based on the idea of 'Topy' written by Marti Raudsepp. 
# Topy is available on Github at https://github.com/intgr/topy.

import argparse
import logging
import os
import shutil
import sys
import urllib2

from bs4 import BeautifulSoup
import glob2
import regex
import six
import yaml


class DownloadRetfListingFailed(Exception):
    """Raised when the RETF listing cannot be downloaded or stored.

    Covers both a failed request and a destination file that could not
    be written.
    """

    pass


def download_listing(dest):
    """Download the latest RETF listing from Wikipedia.

    :param dest: path of the file the raw listing is written to
    :raises DownloadRetfListingFailed: when the request fails or ``dest``
        cannot be written
    """
    logger = logging.getLogger('retf')
    url = ('https://en.wikipedia.org/wiki/Wikipedia:AutoWikiBrowser/'
           'Typos?action=raw')

    try:
        logger.debug("Downloading latest RETF listing from %s into %s.",
                     url, dest)
        response = urllib2.urlopen(url)
        data = response.read()
        logger.info("Downloading latest RETF listing from %s succeeded.", url)
    except urllib2.URLError as ex:
        # HTTPError is a subclass of URLError, so this one clause handles
        # both kinds of failure.
        raise DownloadRetfListingFailed(six.text_type(ex))

    try:
        # The response body is raw bytes; write it in binary mode so it is
        # stored unmodified.
        with open(dest, 'wb') as write:
            write.write(data)
        logger.info("Writing RETF listing to file %s succeeded.", dest)
    except IOError as ex:
        raise DownloadRetfListingFailed(six.text_type(ex))


def soupify_listing(src):
    """Parse a RETF listing.

    :param src: path of the downloaded listing
    :returns: the parsed ``BeautifulSoup`` document
    """
    # BeautifulSoup consumes the file object inside its constructor, so the
    # handle can be closed as soon as the document has been built.
    with open(src) as fpointer:
        return BeautifulSoup(fpointer)


def generate_listing(src):
    """Compile all regular expressions in a RETF listing.

    Every ``typo`` element of the listing is turned into a dictionary with
    the keys ``description``, ``find``, ``replace`` and ``regex`` (the
    compiled pattern). Elements lacking one of the attributes and patterns
    that fail to compile are skipped.

    :param src: path of the RETF listing
    :returns: list of rule dictionaries
    """
    logger = logging.getLogger('retf')
    result = []

    soup = soupify_listing(src)

    for typo in soup.findAll('typo'):
        try:
            word = typo.attrs.get('word').encode('utf8')
            find = typo.attrs.get('find').encode('utf8')
            replace = typo.attrs.get('replace').encode('utf8')
            # The listing writes back-references as $1; the regex module
            # expects \1.
            replace = replace.replace(b'$', b'\\')
        except AttributeError:
            # A missing attribute yields None, which has no encode().
            continue

        # pylint: disable=W0703
        try:
            logger.debug("Compiling regular expression: %s.", find)
            compiled = regex.compile(find, flags=regex.V1)
        except Exception:
            # 'find' is a string: it has to be formatted with %s, %f makes
            # the logging call itself fail.
            logger.error("Compilation of regular expression %s failed.", find)
            continue
        # pylint: enable=W0703

        entry = {
            'description': word,
            'find': find,
            'replace': replace,
            'regex': compiled
        }

        result.append(entry)

    logger.debug("Compiled %d regular expression(s).", len(result))

    return result


def load_text_from_file(src):
    """Load content from a file.

    :param src: path of the file to read
    :returns: the raw (binary) content of the file
    """
    logger = logging.getLogger('retf')
    logger.debug("Loading text from file %s.", src)
    with open(src, 'rb') as fpointer:
        text = fpointer.read()

    return text


def write_text_to_file(dest, text, no_backup, in_place):
    """Write content into a file.

    :param dest: path of the checked file
    :param text: content to write
    :param no_backup: when true, no ``.orig`` backup is created
    :param in_place: when true ``dest`` itself is overwritten, otherwise
        the content goes to a sibling file ``<dest>.retf``
    """
    logger = logging.getLogger('retf')

    if in_place:
        # Only an in-place write modifies the original, so only then is a
        # backup worth creating.
        if not no_backup:
            logger.debug("Copying %s to backup file %s.orig.", dest, dest)
            shutil.copy2(dest, "%s.orig" % dest)
    else:
        dest = "%s.retf" % dest

    logger.debug("Writing text to file %s.", dest)
    with open(dest, 'wb') as fpointer:
        fpointer.write(text)


def initialize_logging(debug, less_verbose):
    """Initialize the logger.

    :param debug: log at DEBUG level (takes precedence over
        ``less_verbose``)
    :param less_verbose: log at WARNING level instead of INFO
    :returns: the configured ``retf`` logger
    """
    logger = logging.getLogger(name='retf')

    # Avoid stacking a second handler (and duplicated output) when this
    # function is called more than once.
    if not logger.handlers:
        formatter = logging.Formatter(
            '%(asctime)s %(levelname)-8s %(message)s')
        handler = logging.StreamHandler()
        handler.setFormatter(formatter)
        logger.addHandler(handler)

    if debug:
        logger.setLevel(logging.DEBUG)
    elif less_verbose:
        logger.setLevel(logging.WARNING)
    else:
        logger.setLevel(logging.INFO)

    return logger


def parse_command_line_arguments():
    """Parse the command line arguments.

    :returns: tuple ``(parser, args)``; the parser is returned as well so
        that the caller can print the usage text
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--debug", help="Print debugging messages.",
                        action="store_true", default=False)
    parser.add_argument("--download", help="Download the latest RETF listing.",
                        action="store_true", default=False)
    parser.add_argument("--less-verbose", help="Be less verbose.",
                        action="store_true", default=False)
    parser.add_argument("--no-backup", help="Don't backup files.",
                        action="store_true", default=False)
    parser.add_argument("--in-place", help="Resolve found errors in place.",
                        action="store_true", default=False)
    parser.add_argument("--write-changes", action="store_true", default=False,
                        help="Write resolved findings back to files.")
    parser.add_argument("--disabled", type=str, default=None,
                        help="File containing the disabled rules.")
    # By default the listing is expected next to this script.
    parser.add_argument("--listing", help="File containing the RETF listing.",
                        type=str, default=os.path.join(
                            os.path.dirname(os.path.realpath(__file__)),
                            'retf.lst'))
    parser.add_argument("--path", type=str, nargs='*', default=[],
                        help="Path(s) that should be checked.")
    parser.add_argument("--extension", type=str, nargs='*', default=[],
                        help="Only check files with specified extension(s).")
    parser.add_argument("--file", nargs='*', type=str, default=[],
                        help="File(s) to check for typographical errors.")
    return (parser, parser.parse_args())


def load_disabled_rules(src):
    """Load disabled rules from a YAML file.

    :param src: path of the YAML file, or a false value for "no file"
    :returns: the disabled rule names; an empty list when no file was
        given, the file is empty, or it could not be read or parsed
    """
    logger = logging.getLogger('retf')

    if not src:
        return []

    try:
        with open(src) as fpointer:
            # safe_load instead of load: the file only holds plain rule
            # names and load() can construct arbitrary objects. An empty
            # document (just '---') parses to None, hence the fallback.
            listing = yaml.safe_load(fpointer) or []
    except (IOError, yaml.YAMLError):
        logger.error("loading disabled rules from file %s failed", src)
        return []

    for rule in listing:
        logger.debug("Rule '%s' is disabled.", rule)

    return listing


def get_file_listing(paths, files, extensions):
    """Generate a listing with all files that should be checked.

    :param paths: directories that are searched recursively
    :param files: explicitly named files, taken over unchanged
    :param extensions: when non-empty, only files below ``paths`` with one
        of these extensions are used
    :returns: list of file names
    """
    result = list(files) if files else []

    if extensions:
        patterns = ["*.%s" % extension for extension in extensions]
    else:
        patterns = ["*"]

    # pylint: disable=E1101
    for path in paths:
        for pattern in patterns:
            matches = glob2.glob("%s/**/%s" % (path, pattern))
            # '**/*' matches directories too; they cannot be opened for
            # reading later on, so keep regular files only.
            result += [match for match in matches if os.path.isfile(match)]
    # pylint: enable=E1101

    return result


def check_file(src, rules, disabled):
    """Apply a set of rules to a file.

    :param src: path of the file to check
    :param rules: rule dictionaries as built by ``generate_listing``
    :param disabled: descriptions of the rules that must be skipped
    :returns: tuple ``(findings, content)`` with the total number of
        matches and the content after all replacements
    """
    logger = logging.getLogger('retf')
    logger.info("Checking file %s for typographical errors.", src)
    content = load_text_from_file(src)
    findings = 0

    for rule in rules:
        if rule.get('description') in disabled:
            continue

        # Log the checked file name (src), not the builtin 'file'.
        logger.debug("%s: checking rule '%s'.", src,
                     rule.get('description'))
        logger.debug(rule.get('find'))
        newcontent, count = rule.get('regex').subn(
            rule.get('replace'), content
        )

        if count > 0:
            logger.warning("%d match(s) in file %s : %s.", count, src,
                           rule.get('description'))
            findings += count
        # Later rules operate on the already corrected text.
        content = newcontent

    return (findings, content)


def main():
    """Entry point for this script.

    :returns: process exit code: 0 when nothing was found, 1 when there
        are findings or the listing is unavailable, 2 when neither files
        nor ``--download`` were requested
    """

    parser, args = parse_command_line_arguments()
    logger = initialize_logging(args.debug, args.less_verbose)

    result = 0

    if args.download:
        try:
            download_listing(args.listing)
        except DownloadRetfListingFailed as ex:
            logger.error("Downloading latest RETF listing failed: %s.", ex)
            result = 1

    if not args.path and not args.file and not args.download:
        parser.print_help()
        result = 2

    if not result and not os.path.isfile(args.listing):
        logger.error("RETF listing not found at %s.", args.listing)
        logger.info("Please download the RETF listing first by using the "
                    "parameter --download.")
        result = 1

    if not result:
        files = get_file_listing(args.path, args.file, args.extension)

        rules = generate_listing(args.listing)
        disabled = load_disabled_rules(args.disabled)

        all_findings = 0
        for check in files:

            (findings, content) = check_file(check, rules, disabled)

            if findings > 0:
                all_findings += findings
                logger.warning("%s finding(s) in file %s.", findings, check)

            if findings > 0 and args.write_changes:
                write_text_to_file(check, content, args.no_backup,
                                   args.in_place)

        if all_findings > 0:
            logger.warning("%s finding(s) in all checked files.", all_findings)
            result = 1

    return result


if __name__ == "__main__":
    sys.exit(main())