Merge "script to apply RETF rules onto a set of files"
This commit is contained in:
commit
b62e875bb1
69
cleanup/retf/README.md
Normal file
69
cleanup/retf/README.md
Normal file
@ -0,0 +1,69 @@
|
||||
# retf.py
|
||||
|
||||
This script applies a set of regular expressions onto a set of files
|
||||
to automatically identify and fix typographical errors.
|
||||
|
||||
## What does RETF mean?
|
||||
|
||||
RETF means RegExTypoFix or Regular Expression Typographical error Fixer
|
||||
and is a set of regular expressions to find and fix common misspellings
|
||||
and grammatical errors.
|
||||
|
||||
The regular expressions are available at
|
||||
https://en.wikipedia.org/wiki/Wikipedia:AutoWikiBrowser/Typos.
|
||||
|
||||
## Usage
|
||||
|
||||
There are two ways to define the set of files. First you can simply add
|
||||
single files using the parameter ```--file```.
|
||||
|
||||
```$ ./retf.py --file path/to/file1 path/to/file2 path/to/file3```
|
||||
|
||||
Also you can specify paths using the parameter ```--path``` that should be
|
||||
scanned for files.
|
||||
|
||||
```$ ./retf.py --path path/with/files/1 path/with/files/2```
|
||||
|
||||
To not use all files inside the specified paths it's possible to filter
|
||||
by the file extension.
|
||||
|
||||
```$ ./retf.py --path path/with/files --extension xml txt rst```
|
||||
|
||||
It's possible to use the parameters ```--path``` and ```--file``` together.
|
||||
|
||||
By default the script will only check for findings in all specified files.
|
||||
|
||||
To automatically write back resolved findings add the parameter
|
||||
```--write-changes```. Findings will then be written to a copy with
|
||||
the ending ```.retf```.
|
||||
|
||||
To fix findings directly in the files add the parameter
|
||||
```--in-place```. Findings will than be fixed directly in the files. A backup file
|
||||
with the ending ```.orig``` will be created. To disable backups add the
|
||||
paramter ```--no-backup```.
|
||||
|
||||
To only check if there are findings inside the defined set of files add
|
||||
|
||||
To download the latest RETF rules from Wikipedia use the parameter ```--download```.
|
||||
|
||||
## Needed Python modules
|
||||
|
||||
* beautifulsoup4 / bs4 (https://pypi.python.org/pypi/beautifulsoup4)
|
||||
* glob2 (https://pypi.python.org/pypi/glob2)
|
||||
* pyyaml (https://pypi.python.org/pypi/pyaml)
|
||||
* regex (https://pypi.python.org/pypi/regex)
|
||||
* six (https://pypi.python.org/pypi/six)
|
||||
|
||||
To install the needed modules you can use pip or the package management system included
|
||||
in your distribution. When using the package management system maybe the name of the
|
||||
packages differ. When using pip it's maybe necessary to install some development packages.
|
||||
For example on Ubuntu 14.04 LTS you have to install ```libyaml-dev``` for ```pyyaml```
|
||||
and ```python-dev``` for ```regex```.
|
||||
|
||||
```
|
||||
$ pip install beautifulsoup4
|
||||
$ pip install glob2
|
||||
$ pip install pyyaml
|
||||
$ pip install regex
|
||||
$ pip install six
|
||||
```
|
1
cleanup/retf/disabled_rules.yaml
Normal file
1
cleanup/retf/disabled_rules.yaml
Normal file
@ -0,0 +1 @@
|
||||
---
|
302
cleanup/retf/retf.py
Executable file
302
cleanup/retf/retf.py
Executable file
@ -0,0 +1,302 @@
|
||||
#!/usr/bin/python
|
||||
|
||||
"""This script applies a set of regular expressions onto a set of files
|
||||
to automatically identify and fix typographical errors.
|
||||
"""
|
||||
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License. You may obtain
|
||||
# a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
# author: Christian Berendt <berendt@b1-systems.de>
|
||||
|
||||
# Based on the idea of 'Topy' written by Marti Raudsepp <marti@juffo.org>.
|
||||
# Topy is available on Github at https://github.com/intgr/topy.
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
import urllib2
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
import glob2
|
||||
import regex
|
||||
import six
|
||||
import yaml
|
||||
|
||||
|
||||
class DownloadRetfListingFailed(Exception):
|
||||
"""Exception will be raised when the download of the RETF
|
||||
listing failed or the destination file could not be written.
|
||||
"""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
def download_listing(dest):
|
||||
"""Download the latest RETF listing from Wikipedia."""
|
||||
logger = logging.getLogger('retf')
|
||||
try:
|
||||
url = ('https://en.wikipedia.org/wiki/Wikipedia:AutoWikiBrowser/'
|
||||
'Typos?action=raw')
|
||||
logger.debug("Downloading latest RETF listing from %s into %s.",
|
||||
url, dest)
|
||||
response = urllib2.urlopen(url)
|
||||
data = response.read()
|
||||
logger.info("Downloading latest RETF listing from %s succeeded.", url)
|
||||
except urllib2.HTTPError as ex:
|
||||
raise DownloadRetfListingFailed(six.text_type(ex))
|
||||
except urllib2.URLError as ex:
|
||||
raise DownloadRetfListingFailed(six.text_type(ex))
|
||||
|
||||
try:
|
||||
with open(dest, 'w+') as write:
|
||||
write.write(data)
|
||||
logger.info("Writing RETF listing to file %s succeeded.", dest)
|
||||
except IOError as ex:
|
||||
raise DownloadRetfListingFailed(six.text_type(ex))
|
||||
|
||||
|
||||
def soupify_listing(src):
|
||||
"""Parse a RETF listing."""
|
||||
return BeautifulSoup(open(src))
|
||||
|
||||
|
||||
def generate_listing(src):
|
||||
"""Compile all regular expressions in a RETF listing."""
|
||||
logger = logging.getLogger('retf')
|
||||
result = []
|
||||
|
||||
soup = soupify_listing(src)
|
||||
|
||||
for typo in soup.findAll('typo'):
|
||||
try:
|
||||
word = typo.attrs.get('word').encode('utf8')
|
||||
find = typo.attrs.get('find').encode('utf8')
|
||||
replace = typo.attrs.get('replace').encode('utf8')
|
||||
replace = replace.replace(b'$', b'\\')
|
||||
except AttributeError:
|
||||
continue
|
||||
|
||||
# pylint: disable=W0703
|
||||
try:
|
||||
logger.debug("Compiling regular expression: %s.", find)
|
||||
compiled = regex.compile(find, flags=regex.V1)
|
||||
except Exception:
|
||||
logger.error("Compilation of regular expression %f failed.", find)
|
||||
continue
|
||||
# pylint: enable=W0703
|
||||
|
||||
entry = {
|
||||
'description': word,
|
||||
'find': find,
|
||||
'replace': replace,
|
||||
'regex': compiled
|
||||
}
|
||||
|
||||
result.append(entry)
|
||||
|
||||
logger.debug("Compiled %d regular expression(s).", len(result))
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def load_text_from_file(src):
|
||||
"""Load content from a file."""
|
||||
logger = logging.getLogger('retf')
|
||||
logger.debug("Loading text from file %s.", src)
|
||||
with open(src, 'rb') as fpointer:
|
||||
text = fpointer.read()
|
||||
|
||||
return text
|
||||
|
||||
|
||||
def write_text_to_file(dest, text, no_backup, in_place):
|
||||
"""Write content into a file."""
|
||||
logger = logging.getLogger('retf')
|
||||
|
||||
if not no_backup:
|
||||
logger.debug("Copying %s to backup file %s.orig.", dest, dest)
|
||||
shutil.copy2(dest, "%s.orig" % dest)
|
||||
|
||||
if not in_place:
|
||||
dest = "%s.retf" % dest
|
||||
|
||||
logger.debug("Writing text to file %s.", dest)
|
||||
with open(dest, 'wb') as fpointer:
|
||||
fpointer.write(text)
|
||||
|
||||
|
||||
def initialize_logging(debug, less_verbose):
|
||||
"""Initialze the Logger."""
|
||||
logger = logging.getLogger(name='retf')
|
||||
formatter = logging.Formatter('%(asctime)s %(levelname)-8s %(message)s')
|
||||
handler = logging.StreamHandler()
|
||||
handler.setFormatter(formatter)
|
||||
logger.addHandler(handler)
|
||||
|
||||
logger.setLevel(logging.INFO)
|
||||
|
||||
if less_verbose:
|
||||
logger.setLevel(logging.WARN)
|
||||
|
||||
if debug:
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
return logging.getLogger('retf')
|
||||
|
||||
|
||||
def parse_command_line_arguments():
|
||||
"""Parse the command line arguments."""
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--debug", help="Print debugging messages.",
|
||||
action="store_true", default=False)
|
||||
parser.add_argument("--download", help="Download the latest RETF listing.",
|
||||
action="store_true", default=False)
|
||||
parser.add_argument("--less-verbose", help="Be less verbose.",
|
||||
action="store_true", default=False)
|
||||
parser.add_argument("--no-backup", help="Don't backup files.",
|
||||
action="store_true", default=False)
|
||||
parser.add_argument("--in-place", help="Resolve found errors in place.",
|
||||
action="store_true", default=False)
|
||||
parser.add_argument("--write-changes", action="store_true", default=False,
|
||||
help="Write resolved findings back to files.")
|
||||
parser.add_argument("--disabled", type=str, default=None,
|
||||
help="File containing the disabled rules.")
|
||||
parser.add_argument("--listing", help="File containing the RETF listing.",
|
||||
type=str, default=os.path.join(
|
||||
os.path.dirname(os.path.realpath(__file__)),
|
||||
'retf.lst'))
|
||||
parser.add_argument("--path", type=str, nargs='*', default=[],
|
||||
help="Path(s) that should be checked.")
|
||||
parser.add_argument("--extension", type=str, nargs='*', default=[],
|
||||
help="Only check files with specified extension(s).")
|
||||
parser.add_argument("--file", nargs='*', type=str, default=[],
|
||||
help="File(s) to check for typographical errors.")
|
||||
return (parser, parser.parse_args())
|
||||
|
||||
|
||||
def load_disabled_rules(src):
|
||||
"""Load disabled rules from YAML file."""
|
||||
logger = logging.getLogger('retf')
|
||||
listing = []
|
||||
|
||||
if src:
|
||||
try:
|
||||
listing = yaml.load(open(src))
|
||||
for rule in listing:
|
||||
logger.debug("Rule '%s' is disabled.", rule)
|
||||
|
||||
except IOError:
|
||||
logger.error("loading disabled rules from file %s failed", src)
|
||||
|
||||
return listing
|
||||
|
||||
|
||||
def get_file_listing(paths, files, extensions):
|
||||
"""Generate listing with all files that should be check."""
|
||||
result = []
|
||||
if files:
|
||||
result += files
|
||||
|
||||
# pylint: disable=E1101
|
||||
for path in paths:
|
||||
if extensions:
|
||||
for extension in extensions:
|
||||
result += glob2.glob("%s/**/*.%s" % (path, extension))
|
||||
else:
|
||||
result += glob2.glob("%s/**/*" % path)
|
||||
# pylint: enable=E1101
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def check_file(src, rules, disabled):
|
||||
"""Applies a set of rules on a file."""
|
||||
logger = logging.getLogger('retf')
|
||||
logger.info("Checking file %s for typographical errors.", src)
|
||||
content = load_text_from_file(src)
|
||||
findings = 0
|
||||
|
||||
for rule in rules:
|
||||
if rule.get('description') in disabled:
|
||||
continue
|
||||
|
||||
logger.debug("%s: checking rule '%s'.", file,
|
||||
rule.get('description'))
|
||||
logger.debug(rule.get('find'))
|
||||
newcontent, count = rule.get('regex').subn(
|
||||
rule.get('replace'), content
|
||||
)
|
||||
|
||||
if count > 0:
|
||||
logger.warning("%d match(s) in file %s : %s.", count, file,
|
||||
rule.get('description'))
|
||||
findings += count
|
||||
content = newcontent
|
||||
|
||||
return (findings, content)
|
||||
|
||||
|
||||
def main():
|
||||
"""Entry point for this script."""
|
||||
|
||||
parser, args = parse_command_line_arguments()
|
||||
logger = initialize_logging(args.debug, args.less_verbose)
|
||||
|
||||
result = 0
|
||||
|
||||
if args.download:
|
||||
try:
|
||||
download_listing(args.listing)
|
||||
except DownloadRetfListingFailed as ex:
|
||||
logger.error("Downloading latest RETF listing failed: %s.", ex)
|
||||
result = 1
|
||||
|
||||
if not args.path and not args.file and not args.download:
|
||||
parser.print_help()
|
||||
result = 2
|
||||
|
||||
if not result and not os.path.isfile(args.listing):
|
||||
logger.error("RETF listing not found at %s.", args.listing)
|
||||
logger.info("Please download the RETF listing first by using the "
|
||||
"parameter --download.")
|
||||
result = 1
|
||||
|
||||
if not result:
|
||||
files = get_file_listing(args.path, args.file, args.extension)
|
||||
|
||||
rules = generate_listing(args.listing)
|
||||
disabled = load_disabled_rules(args.disabled)
|
||||
|
||||
all_findings = 0
|
||||
for check in files:
|
||||
|
||||
(findings, content) = check_file(check, rules, disabled)
|
||||
|
||||
if findings > 0:
|
||||
all_findings += findings
|
||||
logger.warning("%s finding(s) in file %s.", findings, check)
|
||||
|
||||
if findings > 0 and args.write_changes:
|
||||
write_text_to_file(check, content, args.no_backup,
|
||||
args.in_place)
|
||||
|
||||
if all_findings > 0:
|
||||
logger.warning("%s finding(s) in all checked files.", all_findings)
|
||||
result = 1
|
||||
|
||||
return result
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
Loading…
x
Reference in New Issue
Block a user