Files
codespell/codespell_lib/_codespell.py
Peter Newman d978da6e31 Check errors don't exist as valid words in the aspell dictionary (#1142)
* Check errors don't exist as valid words in the aspell dictionary

* Install aspell on Travis

* Add some missing packages

* Remove a virtual package

* Just install the version of aspell-python we need

* Keep flake8 happy

* Switch to warnings and count them, so we can see all the aspell errors in one go

* Handle different encoding of the word and aspell

* Try and fix the encoding conversion

* Find out the encoding type

* Don't assert on number of warnings

* Don't record warnings for now

* Warn on all the encoding options

* pprint the encoding

* More warning work

* Use the actual encoding type

* Correct the logic

* ENH: Multi dict support

* FIX: Fixes after merge

* FIX: Better error check

* FIX: More thorough testing, locations

* FIX: Try newer aspell

* FIX: Move to new dict

* FIX: Move

* FIX: Restore removals from #1181

* FIX: One from #1362

* Add rare chack->check, cheque,

* Minor tidy of some dictionary check code

* Add some more suggestions.

* Fix the whitespace

* Really fix the whitespace

* FIX: Refactor requirement

* Log an error when aspell not found and not required

* Fix the error logging

* Test all variants of present and missing from aspell

* Undo some tuple tidying

* Fix the true/false values used

* Skip some flake8 tests

* Fix the test cases

* Correct the not in aspell test and fix some test cases

* Remove a duplicate test

* Use a test word that isn't a typo

* Set the ideal aspell detection logic for each dictionary

I suspect we'll have to relax this as more obscure words won't be in the aspell dictionary

* Be more realistic given the size of the dictionary

* Fix a flake8 error

* Fix another line length error

* FIX: Move

* FIX: Make visible, simplify

Co-authored-by: Eric Larson <larson.eric.d@gmail.com>
2020-04-06 12:54:16 -04:00

741 lines
27 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see
# http://www.gnu.org/licenses/old-licenses/gpl-2.0.html.
"""
Copyright (C) 2010-2011 Lucas De Marchi <lucas.de.marchi@gmail.com>
Copyright (C) 2011 ProFUSION embedded systems
"""
from __future__ import print_function
import argparse
import codecs
import fnmatch
import os
import re
import sys
# Default regex for what counts as a "word": alphanumerics plus
# underscore, hyphen, apostrophe and backtick (overridable via --regex).
word_regex_def = u"[\\w\\-'`]+"
# Encodings tried, in order, when opening files without chardet.
encodings = ('utf-8', 'iso-8859-1')
USAGE = """
\t%prog [OPTIONS] [file1 file2 ... fileN]
"""
VERSION = '1.17.0.dev0'
# Users might want to link this file into /usr/local/bin, so we resolve the
# symbolic link path to the real path if necessary.
_data_root = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data')
_builtin_dictionaries = ( # name, desc, dictionary-file suffix, err in aspell, correction in aspell # noqa: E501
    # The aspell tests here aren't the ideal state, but the None's are realistic
    # for obscure words
    ('clear', 'for unambiguous errors', '', False, None),
    ('rare', 'for rare but valid words', '_rare', None, None),
    ('informal', 'for informal words', '_informal', True, True),
    ('code', 'for words common to code and/or mathematics', '_code', None, None),  # noqa: E501
    ('names', 'for valid proper names that might be typos', '_names', None, None),  # noqa: E501
    ('en-GB_to_en-US', 'for corrections from en-GB to en-US', '_en-GB_to_en-US', True, True),  # noqa: E501
    )
_builtin_default = 'clear,rare'
# OPTIONS:
#
# ARGUMENTS:
# dict_filename The file containing the dictionary of misspellings.
# If set to '-', it will be read from stdin
# file1 .. fileN Files to check spelling
class QuietLevels(object):
    """Bit-flag values for the --quiet-level option.

    Flags are combined with bitwise OR (e.g. 3 silences both encoding
    and binary-file warnings); code tests them with ``level & FLAG``.
    """
    NONE = 0                 # print everything
    ENCODING = 1             # suppress warnings about wrong encodings
    BINARY_FILE = 2          # suppress warnings about binary files
    DISABLED_FIXES = 4       # suppress warnings about dictionary-disabled fixes
    NON_AUTOMATIC_FIXES = 8  # print nothing for non-automatic fixes
    FIXES = 16               # don't print the names of fixed files
class GlobMatch(object):
    """Matcher over a list of (possibly comma-delimited) glob patterns."""

    def __init__(self, pattern):
        # Each element of `pattern` may itself hold comma-delimited globs,
        # so flatten everything into one list of individual patterns.
        self.pattern_list = ','.join(pattern).split(',') if pattern else None

    def match(self, filename):
        """Return True if *filename* matches any configured pattern."""
        if self.pattern_list is None:
            return False
        return any(fnmatch.fnmatch(filename, pat)
                   for pat in self.pattern_list)
class Misspelling(object):
    """One dictionary entry: the suggested fix(es) plus metadata.

    data   -- suggested replacement(s); comma-separated when more than one
    fix    -- True if the correction may be applied automatically
    reason -- why an automatic fix is disabled, or '' when no reason given
    """
    def __init__(self, data, fix, reason):
        self.data = data
        self.fix = fix
        self.reason = reason
class TermColors(object):
    """ANSI escape codes used to colorize report output."""

    def __init__(self):
        self.FILE = '\033[33m'     # yellow: file names
        self.WWORD = '\033[31m'    # red: the wrong word
        self.FWORD = '\033[32m'    # green: the suggested fix
        self.DISABLE = '\033[0m'   # reset attributes

    def disable(self):
        """Blank every escape sequence so output is plain text."""
        for name in ('FILE', 'WWORD', 'FWORD', 'DISABLE'):
            setattr(self, name, '')
class Summary(object):
    """Tally of how many times each misspelling was encountered."""

    def __init__(self):
        self.summary = {}

    def update(self, wrongword):
        """Count one more occurrence of *wrongword*."""
        self.summary[wrongword] = self.summary.get(wrongword, 0) + 1

    def __str__(self):
        # One "<word><right-aligned count>" line per word, sorted by word;
        # the count is padded so the full field spans roughly 15 columns.
        return "\n".join("{0}{1:{width}}".format(word,
                                                 self.summary[word],
                                                 width=15 - len(word))
                         for word in sorted(self.summary))
class FileOpener(object):
    """Open text files, returning their lines and the encoding used.

    With ``use_chardet=True`` the encoding is detected via the chardet
    package; otherwise the encodings in the module-level ``encodings``
    tuple are tried in order.
    """

    def __init__(self, use_chardet, quiet_level):
        self.use_chardet = use_chardet
        if use_chardet:
            self.init_chardet()
        self.quiet_level = quiet_level

    def init_chardet(self):
        """Import chardet lazily, failing with an actionable message."""
        try:
            from chardet.universaldetector import UniversalDetector
        except ImportError:
            raise ImportError("There's no chardet installed to import from. "
                              "Please, install it and check your PYTHONPATH "
                              "environment variable")
        self.encdetector = UniversalDetector()

    def open(self, filename):
        """Return ``(lines, encoding)`` for *filename*."""
        if self.use_chardet:
            return self.open_with_chardet(filename)
        else:
            return self.open_with_internal(filename)

    def open_with_chardet(self, filename):
        """Detect the encoding with chardet, then read with it."""
        self.encdetector.reset()
        with codecs.open(filename, 'rb') as f:
            for line in f:
                self.encdetector.feed(line)
                if self.encdetector.done:
                    break
        self.encdetector.close()
        encoding = self.encdetector.result['encoding']
        try:
            f = codecs.open(filename, 'r', encoding=encoding)
        except LookupError:
            print('ERROR: %s -- Don\'t know how to handle encoding %s'
                  % (filename, encoding), file=sys.stderr)
            raise
        try:
            # BUGFIX: decoding errors surface at read time, not at open
            # time, so the read must be inside the try for the error
            # message to ever be printed.
            lines = f.readlines()
        except UnicodeDecodeError:
            print('ERROR: Could not detect encoding: %s' % filename,
                  file=sys.stderr)
            raise
        finally:
            f.close()
        return lines, encoding

    def open_with_internal(self, filename):
        """Try each fallback encoding in turn until one decodes cleanly."""
        for curr, encoding in enumerate(encodings):
            try:
                f = codecs.open(filename, 'r', encoding=encoding)
                try:
                    # BUGFIX: codecs.open() succeeds regardless of the
                    # encoding; the UnicodeDecodeError driving the
                    # fallback is raised here by readlines().  The
                    # original code read outside the try, so the
                    # fallback never triggered.
                    lines = f.readlines()
                finally:
                    f.close()
                return lines, encoding
            except UnicodeDecodeError:
                if not self.quiet_level & QuietLevels.ENCODING:
                    print('WARNING: Decoding file %s' % filename,
                          file=sys.stderr)
                    print('WARNING: using encoding=%s failed. '
                          % encoding, file=sys.stderr)
                    try:
                        print('WARNING: Trying next encoding: %s'
                              % encodings[curr + 1], file=sys.stderr)
                    except IndexError:
                        # No further encodings to try.
                        pass
        # Every candidate encoding failed to decode the file.
        raise Exception('Unknown encoding')
# -.-:-.-:-.-:-.:-.-:-.-:-.-:-.-:-.:-.-:-.-:-.-:-.-:-.:-.-:-
def parse_options(args):
    """Build the command-line parser and parse *args*.

    Returns ``(options, parser)`` so callers can print help on error.
    Defaults to checking the current directory when no files are given.
    """
    parser = argparse.ArgumentParser()
    # Colors default to on only when stdout is an interactive terminal.
    parser.set_defaults(colors=sys.stdout.isatty())
    parser.add_argument('--version', action='version', version=VERSION)
    parser.add_argument('-d', '--disable-colors',
                        action='store_false', dest='colors',
                        help='disable colors, even when printing to terminal '
                             '(always set for Windows)')
    parser.add_argument('-c', '--enable-colors',
                        action='store_true', dest='colors',
                        help='enable colors, even when not printing to '
                             'terminal')
    parser.add_argument('-w', '--write-changes',
                        action='store_true', default=False,
                        help='write changes in place if possible')
    parser.add_argument('-D', '--dictionary',
                        action='append',
                        help='Custom dictionary file that contains spelling '
                             'corrections. If this flag is not specified or '
                             'equals "-" then the default dictionary is used. '
                             'This option can be specified multiple times.')
    # Human-readable list of the builtin dictionaries for the --builtin help.
    builtin_opts = ', '.join(
        '%r %s' % (d[0], d[1]) for d in _builtin_dictionaries)
    parser.add_argument('--builtin',
                        dest='builtin', default=_builtin_default,
                        metavar='BUILTIN-LIST',
                        help='Comma-separated list of builtin dictionaries '
                             'to include (when "-D -" or no "-D" is passed). '
                             'Current options are:\n%s. The default is '
                             '"--builtin %s".'
                             % (builtin_opts, _builtin_default))
    parser.add_argument('-I', '--ignore-words',
                        action='append', metavar='FILE',
                        help='File that contains words which will be ignored '
                             'by codespell. File must contain 1 word per line.'
                             ' Words are case sensitive based on how they are '
                             'written in the dictionary file')
    parser.add_argument('-L', '--ignore-words-list',
                        action='append', metavar='WORDS',
                        help='Comma separated list of words to be ignored '
                             'by codespell. Words are case sensitive based on '
                             'how they are written in the dictionary file')
    parser.add_argument('-r', '--regex',
                        action='store', type=str,
                        help='Regular expression which is used to find words. '
                             'By default any alphanumeric character, the '
                             'underscore, the hyphen, and the apostrophe is '
                             'used to build words. This option cannot be '
                             'specified together with --write-changes.')
    parser.add_argument('-s', '--summary',
                        action='store_true', default=False,
                        help='print summary of fixes')
    parser.add_argument('-S', '--skip',
                        action='append',
                        help='Comma-separated list of files to skip. It '
                             'accepts globs as well. E.g.: if you want '
                             'codespell to skip .eps and .txt files, '
                             'you\'d give "*.eps,*.txt" to this option.')
    parser.add_argument('-x', '--exclude-file', type=str, metavar='FILE',
                        help='FILE with lines that should not be changed')
    parser.add_argument('-i', '--interactive',
                        action='store', type=int, default=0,
                        help='Set interactive mode when writing changes. '
                             '0: no interactivity. 1: ask for confirmation. '
                             '2 ask user to choose one fix when more than one '
                             'is available. 3: both 1 and 2')
    parser.add_argument('-q', '--quiet-level',
                        action='store', type=int, default=0,
                        help='Bitmask that allows codespell to run quietly. '
                             '0: the default, in which all messages are '
                             'printed. 1: disable warnings about wrong '
                             'encoding. 2: disable warnings about binary '
                             'file. 4: shut down warnings about automatic '
                             'fixes that were disabled in dictionary. '
                             '8: don\'t print anything for non-automatic '
                             'fixes. 16: don\'t print fixed files.')
    parser.add_argument('-e', '--hard-encoding-detection',
                        action='store_true', default=False,
                        help='Use chardet to detect the encoding of each '
                             'file. This can slow down codespell, but is more '
                             'reliable in detecting encodings other than '
                             'utf-8, iso8859-1, and ascii.')
    parser.add_argument('-f', '--check-filenames',
                        action='store_true', default=False,
                        help='check file names as well')
    parser.add_argument('-H', '--check-hidden',
                        action='store_true', default=False,
                        help='check hidden files (those starting with ".") as '
                             'well')
    parser.add_argument('-A', '--after-context', type=int, metavar='LINES',
                        help='print LINES of trailing context')
    parser.add_argument('-B', '--before-context', type=int, metavar='LINES',
                        help='print LINES of leading context')
    parser.add_argument('-C', '--context', type=int, metavar='LINES',
                        help='print LINES of surrounding context')
    parser.add_argument('files', nargs='*',
                        help='files or directories to check')
    options = parser.parse_args(list(args))
    # No explicit targets: check the current directory.
    if not options.files:
        options.files.append('.')
    return options, parser
def build_exclude_hashes(filename, exclude_lines):
    """Add every line of *filename* (newline included) to *exclude_lines*."""
    with codecs.open(filename, 'r') as f:
        exclude_lines.update(f)
def build_ignore_words(filename, ignore_words):
    """Add each whitespace-stripped line of *filename* to *ignore_words*."""
    with codecs.open(filename, mode='r', encoding='utf-8') as f:
        ignore_words.update(entry.strip() for entry in f)
def build_dict(filename, misspellings, ignore_words):
    """Load "wrong->right[,...][, reason]" entries into *misspellings*.

    Entries whose (lowercased) key appears in *ignore_words* are skipped.
    """
    with codecs.open(filename, mode='r', encoding='utf-8') as f:
        for line in f:
            key, data = line.split('->')
            # TODO for now, convert both to lower. Someday we can maybe add
            # support for fixing caps.
            key = key.lower()
            data = data.lower()
            if key in ignore_words:
                continue
            data = data.strip()
            comma = data.rfind(',')
            if comma < 0:
                # Single suggestion: the fix may be applied automatically.
                fix, reason = True, ''
            elif comma == len(data) - 1:
                # Trailing comma: several suggestions, automatic fix off.
                data = data[:comma]
                fix, reason = False, ''
            else:
                # Text after the final comma explains why the fix is off.
                reason = data[comma + 1:].strip()
                data = data[:comma]
                fix = False
            misspellings[key] = Misspelling(data, fix, reason)
def is_hidden(filename, check_hidden):
    """Return True if *filename* is a dot-file that should be skipped.

    '', '.' and '..' never count as hidden; with check_hidden=True
    nothing does.
    """
    base = os.path.basename(filename)
    if check_hidden or base in ('', '.', '..'):
        return False
    return base.startswith('.')
def is_text_file(filename):
    """Heuristic: text unless the first 1 KiB contains a NUL byte."""
    with open(filename, mode='rb') as f:
        return b'\x00' not in f.read(1024)
def fix_case(word, fixword):
    """Carry the capitalization pattern of *word* over to *fixword*."""
    if word == word.capitalize():
        return fixword.capitalize()
    if word == word.upper():
        return fixword.upper()
    # Lower case or mixed case we can't classify: leave the fix as-is.
    return fixword
def ask_for_word_fix(line, wrongword, misspelling, interactivity):
    """Interactively decide how to fix *wrongword* found on *line*.

    interactivity is the --interactive bitmask: bit 1 asks the user to
    confirm an automatic fix, bit 2 asks the user to choose among
    multiple candidate fixes.  Reads answers from stdin.  Returns
    ``(fix, fixword)`` and may mutate *misspelling* to record the
    decision so the same word is handled consistently later.
    """
    if interactivity <= 0:
        # Non-interactive: just report what the dictionary says.
        return misspelling.fix, fix_case(wrongword, misspelling.data)
    if misspelling.fix and interactivity & 1:
        # Confirm (Y/n) before applying an automatic fix.
        r = ''
        fixword = fix_case(wrongword, misspelling.data)
        while not r:
            print("%s\t%s ==> %s (Y/n) " % (line, wrongword, fixword), end='')
            r = sys.stdin.readline().strip().upper()
            if not r:
                r = 'Y'
            if r != 'Y' and r != 'N':
                print("Say 'y' or 'n'")
                r = ''
        if r == 'N':
            # Remember the refusal so this fix stays disabled from now on.
            misspelling.fix = False
            misspelling.fixword = ''
    elif (interactivity & 2) and not misspelling.reason:
        # if it is not disabled, i.e. it just has more than one possible fix,
        # we ask the user which word to use
        r = ''
        opt = list(map(lambda x: x.strip(), misspelling.data.split(',')))
        while not r:
            print("%s Choose an option (blank for none): " % line, end='')
            for i in range(len(opt)):
                fixword = fix_case(wrongword, opt[i])
                print(" %d) %s" % (i, fixword), end='')
            print(": ", end='')
            sys.stdout.flush()
            n = sys.stdin.readline().strip()
            if not n:
                # Blank answer: leave the word unfixed.
                break
            try:
                n = int(n)
                r = opt[n]
            except (ValueError, IndexError):
                print("Not a valid option\n")
        if r:
            # Record the chosen suggestion as the single automatic fix.
            misspelling.fix = True
            misspelling.data = r
    return misspelling.fix, fix_case(wrongword, misspelling.data)
def print_context(lines, index, context):
    """Print the lines surrounding *index*.

    context is a ``(before, after)`` pair of line counts; the line at
    *index* is marked with '>' and its neighbours with ':'.
    """
    before, after = context
    for i in range(index - before, index + after + 1):
        if 0 <= i < len(lines):
            marker = '>' if i == index else ':'
            print('%s %s' % (marker, lines[i].rstrip()))
def parse_file(filename, colors, summary, misspellings, exclude_lines,
               file_opener, word_regex, context, options):
    """Check one file (or stdin when filename == '-') for misspellings.

    Prints a report line for each misspelling, optionally rewrites the
    file in place (--write-changes), updates *summary*, and returns the
    number of errors that were NOT automatically fixed.
    """
    bad_count = 0
    lines = None
    changed = False
    encoding = encodings[0]  # if not defined, use UTF-8
    if filename == '-':
        f = sys.stdin
        lines = f.readlines()
    else:
        # ignore binary files
        if not os.path.isfile(filename):
            return 0
        if options.check_filenames:
            # Check the words making up the file name itself.
            for word in word_regex.findall(filename):
                lword = word.lower()
                if lword not in misspellings:
                    continue
                fix = misspellings[lword].fix
                fixword = fix_case(word, misspellings[lword].data)
                if summary and fix:
                    summary.update(lword)
                cfilename = "%s%s%s" % (colors.FILE, filename, colors.DISABLE)
                cwrongword = "%s%s%s" % (colors.WWORD, word, colors.DISABLE)
                crightword = "%s%s%s" % (colors.FWORD, fixword, colors.DISABLE)
                if misspellings[lword].reason:
                    if options.quiet_level & QuietLevels.DISABLED_FIXES:
                        continue
                    creason = " | %s%s%s" % (colors.FILE,
                                             misspellings[lword].reason,
                                             colors.DISABLE)
                else:
                    if options.quiet_level & QuietLevels.NON_AUTOMATIC_FIXES:
                        continue
                    creason = ''
                # File names are never rewritten, so every hit counts.
                bad_count += 1
                print("%(FILENAME)s: %(WRONGWORD)s "
                      " ==> %(RIGHTWORD)s%(REASON)s"
                      % {'FILENAME': cfilename,
                         'WRONGWORD': cwrongword,
                         'RIGHTWORD': crightword, 'REASON': creason})
        text = is_text_file(filename)
        if not text:
            if not options.quiet_level & QuietLevels.BINARY_FILE:
                print("WARNING: Binary file: %s " % filename, file=sys.stderr)
            return 0
        try:
            lines, encoding = file_opener.open(filename)
        except Exception:
            # Unreadable/undecodable files are silently skipped.
            return 0
    for i, line in enumerate(lines):
        if line in exclude_lines:
            continue
        fixed_words = set()
        asked_for = set()
        for word in word_regex.findall(line):
            lword = word.lower()
            if lword in misspellings:
                context_shown = False
                fix = misspellings[lword].fix
                fixword = fix_case(word, misspellings[lword].data)
                if options.interactive and lword not in asked_for:
                    if context is not None:
                        context_shown = True
                        print_context(lines, i, context)
                    fix, fixword = ask_for_word_fix(
                        lines[i], word, misspellings[lword],
                        options.interactive)
                    asked_for.add(lword)
                if summary and fix:
                    summary.update(lword)
                if word in fixed_words:  # can skip because of re.sub below
                    continue
                if options.write_changes and fix:
                    changed = True
                    lines[i] = re.sub(r'\b%s\b' % word, fixword, lines[i])
                    fixed_words.add(word)
                    continue
                # otherwise warning was explicitly set by interactive mode
                if (options.interactive & 2 and not fix and not
                        misspellings[lword].reason):
                    continue
                cfilename = "%s%s%s" % (colors.FILE, filename, colors.DISABLE)
                cline = "%s%d%s" % (colors.FILE, i + 1, colors.DISABLE)
                cwrongword = "%s%s%s" % (colors.WWORD, word, colors.DISABLE)
                crightword = "%s%s%s" % (colors.FWORD, fixword, colors.DISABLE)
                if misspellings[lword].reason:
                    if options.quiet_level & QuietLevels.DISABLED_FIXES:
                        continue
                    creason = " | %s%s%s" % (colors.FILE,
                                             misspellings[lword].reason,
                                             colors.DISABLE)
                else:
                    if options.quiet_level & QuietLevels.NON_AUTOMATIC_FIXES:
                        continue
                    creason = ''
                # If we get to this point (uncorrected error) we should change
                # our bad_count and thus return value
                bad_count += 1
                if (not context_shown) and (context is not None):
                    print_context(lines, i, context)
                if filename != '-':
                    print("%(FILENAME)s:%(LINE)s: %(WRONGWORD)s "
                          " ==> %(RIGHTWORD)s%(REASON)s"
                          % {'FILENAME': cfilename, 'LINE': cline,
                             'WRONGWORD': cwrongword,
                             'RIGHTWORD': crightword, 'REASON': creason})
                else:
                    print('%(LINE)s: %(STRLINE)s\n\t%(WRONGWORD)s '
                          '==> %(RIGHTWORD)s%(REASON)s'
                          % {'LINE': cline, 'STRLINE': line.strip(),
                             'WRONGWORD': cwrongword,
                             'RIGHTWORD': crightword, 'REASON': creason})
    if changed:
        if filename == '-':
            # stdin input: emit the corrected text instead of rewriting.
            print("---")
            for line in lines:
                print(line, end='')
        else:
            if not options.quiet_level & QuietLevels.FIXES:
                print("%sFIXED:%s %s"
                      % (colors.FWORD, colors.DISABLE, filename),
                      file=sys.stderr)
            # Write back using the encoding the file was read with.
            with codecs.open(filename, 'w', encoding=encoding) as f:
                f.writelines(lines)
    return bad_count
def _script_main():
    """Wrap to main() for setuptools."""
    args = sys.argv[1:]
    return main(*args)
def main(*args):
    """Run codespell over *args* (command-line arguments).

    Returns the number of uncorrected misspellings found, or 1 on a
    usage error (so the value doubles as a process exit status).
    """
    options, parser = parse_options(args)
    # A custom regex may match text that re.sub's \b rewriting would
    # mangle, so the two options are mutually exclusive.
    if options.regex and options.write_changes:
        print('ERROR: --write-changes cannot be used together with '
              '--regex')
        parser.print_help()
        return 1
    word_regex = options.regex or word_regex_def
    try:
        word_regex = re.compile(word_regex)
    except re.error as err:
        print('ERROR: invalid regular expression "%s" (%s)' %
              (word_regex, err), file=sys.stderr)
        parser.print_help()
        return 1
    # Collect words to ignore from -I files and -L lists.
    ignore_words_files = options.ignore_words or []
    ignore_words = set()
    for ignore_words_file in ignore_words_files:
        if not os.path.isfile(ignore_words_file):
            print('ERROR: cannot find ignore-words file: %s' %
                  ignore_words_file, file=sys.stderr)
            parser.print_help()
            return 1
        build_ignore_words(ignore_words_file, ignore_words)
    ignore_words_list = options.ignore_words_list or []
    for comma_separated_words in ignore_words_list:
        for word in comma_separated_words.split(','):
            ignore_words.add(word.strip())
    if options.dictionary:
        dictionaries = options.dictionary
    else:
        dictionaries = ['-']
    use_dictionaries = list()
    for dictionary in dictionaries:
        if dictionary == "-":
            # figure out which builtin dictionaries to use
            use = sorted(set(options.builtin.split(',')))
            for u in use:
                for builtin in _builtin_dictionaries:
                    if builtin[0] == u:
                        use_dictionaries.append(
                            os.path.join(_data_root, 'dictionary%s.txt'
                                         % (builtin[2],)))
                        break
                else:
                    # for/else: no builtin matched this name.
                    print('ERROR: Unknown builtin dictionary: %s' % (u,),
                          file=sys.stderr)
                    parser.print_help()
                    return 1
        else:
            if not os.path.isfile(dictionary):
                print('ERROR: cannot find dictionary file: %s' % dictionary,
                      file=sys.stderr)
                parser.print_help()
                return 1
            use_dictionaries.append(dictionary)
    # Merge every selected dictionary into one lookup table.
    misspellings = dict()
    for dictionary in use_dictionaries:
        build_dict(dictionary, misspellings, ignore_words)
    colors = TermColors()
    if not options.colors or sys.platform == 'win32':
        colors.disable()
    if options.summary:
        summary = Summary()
    else:
        summary = None
    # Resolve the -A/-B/-C options into a (before, after) pair.
    context = None
    if options.context is not None:
        if (options.before_context is not None) or \
                (options.after_context is not None):
            print('ERROR: --context/-C cannot be used together with '
                  '--context-before/-B or --context-after/-A')
            parser.print_help()
            return 1
        context_both = max(0, options.context)
        context = (context_both, context_both)
    elif (options.before_context is not None) or \
            (options.after_context is not None):
        context_before = 0
        context_after = 0
        if options.before_context is not None:
            context_before = max(0, options.before_context)
        if options.after_context is not None:
            context_after = max(0, options.after_context)
        context = (context_before, context_after)
    exclude_lines = set()
    if options.exclude_file:
        build_exclude_hashes(options.exclude_file, exclude_lines)
    file_opener = FileOpener(options.hard_encoding_detection,
                             options.quiet_level)
    glob_match = GlobMatch(options.skip)
    bad_count = 0
    for filename in options.files:
        # ignore hidden files
        if is_hidden(filename, options.check_hidden):
            continue
        if os.path.isdir(filename):
            for root, dirs, files in os.walk(filename):
                if glob_match.match(root):  # skip (absolute) directories
                    del dirs[:]
                    continue
                for file_ in files:
                    if glob_match.match(file_):  # skip files
                        continue
                    fname = os.path.join(root, file_)
                    if glob_match.match(fname):  # skip paths
                        continue
                    if not os.path.isfile(fname) or not os.path.getsize(fname):
                        continue
                    bad_count += parse_file(
                        fname, colors, summary, misspellings, exclude_lines,
                        file_opener, word_regex, context, options)
                # skip (relative) directories
                dirs[:] = [dir_ for dir_ in dirs if not glob_match.match(dir_)]
        else:
            bad_count += parse_file(
                filename, colors, summary, misspellings, exclude_lines,
                file_opener, word_regex, context, options)
    if summary:
        print("\n-------8<-------\nSUMMARY:")
        print(summary)
    return bad_count