Files
codespell/codespell_lib/_codespell.py
2020-04-19 18:58:23 +01:00

741 lines
27 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see
# http://www.gnu.org/licenses/old-licenses/gpl-2.0.html.
"""
Copyright (C) 2010-2011 Lucas De Marchi <lucas.de.marchi@gmail.com>
Copyright (C) 2011 ProFUSION embedded systems
"""
from __future__ import print_function
import argparse
import codecs
import fnmatch
import os
import re
import sys
# Default pattern used to pick candidate words out of a line: runs of word
# characters, hyphens, apostrophes and backticks.
word_regex_def = u"[\\w\\-'`]+"
# Encodings tried, in this order, when chardet-based detection is not used.
encodings = ('utf-8', 'iso-8859-1')
# NOTE(review): optparse-style usage string; not referenced anywhere in the
# visible code.
USAGE = """
\t%prog [OPTIONS] [file1 file2 ... fileN]
"""
# Version string reported by the --version option.
VERSION = '1.17.0.dev0'
# Users might want to link this file into /usr/local/bin, so we resolve the
# symbolic link path to the real path if necessary.
_data_root = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data')
# Builtin dictionaries. Each entry is a 5-tuple; the first element is the name
# accepted by --builtin, the second is the description shown in --help, and
# the third is the suffix inserted into the data file name
# ('dictionary<suffix>.txt' under _data_root). The last two elements are the
# aspell expectations named in the inline comment below; they are not read by
# the visible code.
_builtin_dictionaries = (  # name, desc, name, err in aspell, correction in aspell  # noqa: E501
    # The aspell tests here aren't the ideal state, but the None's are realistic
    # for obscure words
    ('clear', 'for unambiguous errors', '', False, None),
    ('rare', 'for rare but valid words', '_rare', True, None),
    ('informal', 'for informal words', '_informal', True, True),
    ('code', 'for words common to code and/or mathematics', '_code', None, None),  # noqa: E501
    ('names', 'for valid proper names that might be typos', '_names', None, None),  # noqa: E501
    ('en-GB_to_en-US', 'for corrections from en-GB to en-US', '_en-GB_to_en-US', True, True),  # noqa: E501
)
# Value of --builtin when the option is not given.
_builtin_default = 'clear,rare'
# OPTIONS:
#
# ARGUMENTS:
# dict_filename The file containing the dictionary of misspellings.
# If set to '-', it will be read from stdin
# file1 .. fileN Files to check spelling
class QuietLevels(object):
    """Bit flags accepted by the ``--quiet-level`` option.

    The values are powers of two so that several of them can be OR-ed
    together into one integer and tested individually with ``&``.
    """

    NONE = 0                      # print every message
    ENCODING = 1 << 0             # 1: silence wrong-encoding warnings
    BINARY_FILE = 1 << 1          # 2: silence binary-file warnings
    DISABLED_FIXES = 1 << 2       # 4: silence fixes disabled in the dictionary
    NON_AUTOMATIC_FIXES = 1 << 3  # 8: silence non-automatic fixes
    FIXES = 1 << 4                # 16: do not report fixed files
class GlobMatch(object):
    """Test names against a collection of shell-style glob patterns."""

    def __init__(self, pattern):
        """Remember the globs to match against.

        ``pattern`` is a sequence of strings, each of which may itself hold
        several comma-separated globs. A falsy value means "match nothing".
        """
        # Joining and re-splitting on commas flattens every entry into one
        # list of individual globs.
        self.pattern_list = ','.join(pattern).split(',') if pattern else None

    def match(self, filename):
        """Return True if ``filename`` matches at least one stored glob."""
        globs = self.pattern_list
        if globs is None:
            return False
        return any(fnmatch.fnmatch(filename, glob) for glob in globs)
class Misspelling(object):
    """Plain record describing how one misspelled word should be handled.

    Attributes:
        data: the replacement text for the word.
        fix: whether the replacement may be applied automatically.
        reason: explanatory text attached to the entry ('' when none).
    """

    def __init__(self, data, fix, reason):
        self.data, self.fix, self.reason = data, fix, reason
class TermColors(object):
    """Holder for the ANSI escape sequences used to colour the output."""

    # Names of every colour attribute kept on an instance.
    _ATTRIBUTES = ('FILE', 'WWORD', 'FWORD', 'DISABLE')

    def __init__(self):
        (self.FILE,
         self.WWORD,
         self.FWORD,
         self.DISABLE) = ('\033[33m', '\033[31m', '\033[32m', '\033[0m')

    def disable(self):
        """Replace every escape sequence with the empty string."""
        for attribute in self._ATTRIBUTES:
            setattr(self, attribute, '')
class Summary(object):
    """Tally of how many times each word was reported."""

    def __init__(self):
        # Maps a word to the number of times update() was called with it.
        self.summary = {}

    def update(self, wrongword):
        """Add one occurrence of ``wrongword`` to the tally."""
        self.summary[wrongword] = self.summary.get(wrongword, 0) + 1

    def __str__(self):
        """Return one line per word, sorted, with the count right-aligned."""
        rows = []
        for word in sorted(self.summary):
            # The count is padded so that word plus count spans 15 columns.
            rows.append("{0}{1:{width}}".format(
                word, self.summary.get(word), width=15 - len(word)))
        return "\n".join(rows)
class FileOpener(object):
    """Read a text file into a list of lines, working out its encoding.

    Two strategies are available: detection through the optional ``chardet``
    package, or trying each entry of the module-level ``encodings`` tuple in
    turn.
    """

    def __init__(self, use_chardet, quiet_level):
        """Set up the opener.

        Args:
            use_chardet: when true, encodings are detected with chardet.
            quiet_level: bitmask of ``QuietLevels`` flags; only the
                ``ENCODING`` bit is consulted here.

        Raises:
            ImportError: if ``use_chardet`` is true and chardet is missing.
        """
        self.use_chardet = use_chardet
        if use_chardet:
            self.init_chardet()
        self.quiet_level = quiet_level

    def init_chardet(self):
        """Import chardet and create the detector used by later calls."""
        try:
            from chardet.universaldetector import UniversalDetector
        except ImportError:
            raise ImportError("There's no chardet installed to import from. "
                              "Please, install it and check your PYTHONPATH "
                              "environment variable")

        self.encdetector = UniversalDetector()

    def open(self, filename):
        """Return ``(lines, encoding)`` for ``filename``."""
        if self.use_chardet:
            return self.open_with_chardet(filename)
        return self.open_with_internal(filename)

    def open_with_chardet(self, filename):
        """Detect the encoding with chardet, then read the file.

        Returns:
            ``(lines, encoding)`` where ``encoding`` is chardet's result.

        Raises:
            UnicodeDecodeError: the detected encoding cannot decode the file.
            LookupError: the detected encoding is unknown to Python.
        """
        self.encdetector.reset()
        with codecs.open(filename, 'rb') as f:
            for line in f:
                self.encdetector.feed(line)
                if self.encdetector.done:
                    break
        self.encdetector.close()
        encoding = self.encdetector.result['encoding']

        try:
            # Decoding happens while reading, not while opening, so the read
            # has to live inside the try block for the handlers to fire. The
            # with-statement closes the file even when decoding fails.
            with codecs.open(filename, 'r', encoding=encoding) as f:
                lines = f.readlines()
        except UnicodeDecodeError:
            print('ERROR: Could not detect encoding: %s' % filename,
                  file=sys.stderr)
            raise
        except LookupError:
            print('ERROR: %s -- Don\'t know how to handle encoding %s'
                  % (filename, encoding), file=sys.stderr)
            raise

        return lines, encoding

    def open_with_internal(self, filename):
        """Read the file with the first entry of ``encodings`` that works.

        Returns:
            ``(lines, encoding)`` for the first encoding that decodes the
            whole file.

        Raises:
            Exception: no encoding could decode the file, or the file yielded
                no lines.
        """
        for curr, encoding in enumerate(encodings):
            try:
                # The read is inside the try block because that is where a
                # UnicodeDecodeError is actually raised.
                with codecs.open(filename, 'r', encoding=encoding) as f:
                    lines = f.readlines()
            except UnicodeDecodeError:
                if not self.quiet_level & QuietLevels.ENCODING:
                    print('WARNING: Decoding file %s' % filename,
                          file=sys.stderr)
                    print('WARNING: using encoding=%s failed. '
                          % encoding, file=sys.stderr)
                    if curr + 1 < len(encodings):
                        print('WARNING: Trying next encoding: %s'
                              % encodings[curr + 1], file=sys.stderr)
            else:
                break
        else:
            # Every candidate encoding failed.
            raise Exception('Unknown encoding')

        if not lines:
            raise Exception('Unknown encoding')

        return lines, encoding
# -.-:-.-:-.-:-.:-.-:-.-:-.-:-.-:-.:-.-:-.-:-.-:-.-:-.:-.-:-
def parse_options(args):
    """Parse the command-line arguments.

    Args:
        args: iterable of argument strings (without the program name).

    Returns:
        A ``(options, parser)`` tuple: the parsed namespace and the
        ``ArgumentParser`` that produced it. When no file is given,
        ``options.files`` is ``['.']``.
    """
    parser = argparse.ArgumentParser()
    add_option = parser.add_argument

    # Colours default to on only when stdout is a terminal.
    parser.set_defaults(colors=sys.stdout.isatty())
    add_option('--version', action='version', version=VERSION)

    add_option('-d', '--disable-colors',
               action='store_false', dest='colors',
               help='disable colors, even when printing to terminal '
                    '(always set for Windows)')
    add_option('-c', '--enable-colors',
               action='store_true', dest='colors',
               help='enable colors, even when not printing to '
                    'terminal')

    add_option('-w', '--write-changes',
               action='store_true', default=False,
               help='write changes in place if possible')

    add_option('-D', '--dictionary',
               action='append',
               help='Custom dictionary file that contains spelling '
                    'corrections. If this flag is not specified or '
                    'equals "-" then the default dictionary is used. '
                    'This option can be specified multiple times.')
    builtin_descriptions = ', '.join(
        '%r %s' % (entry[0], entry[1]) for entry in _builtin_dictionaries)
    add_option('--builtin',
               dest='builtin', default=_builtin_default,
               metavar='BUILTIN-LIST',
               help='Comma-separated list of builtin dictionaries '
                    'to include (when "-D -" or no "-D" is passed). '
                    'Current options are:\n%s. The default is '
                    '"--builtin %s".'
                    % (builtin_descriptions, _builtin_default))
    add_option('-I', '--ignore-words',
               action='append', metavar='FILE',
               help='File that contains words which will be ignored '
                    'by codespell. File must contain 1 word per line.'
                    ' Words are case sensitive based on how they are '
                    'written in the dictionary file')
    add_option('-L', '--ignore-words-list',
               action='append', metavar='WORDS',
               help='Comma separated list of words to be ignored '
                    'by codespell. Words are case sensitive based on '
                    'how they are written in the dictionary file')
    add_option('-r', '--regex',
               action='store', type=str,
               help='Regular expression which is used to find words. '
                    'By default any alphanumeric character, the '
                    'underscore, the hyphen, and the apostrophe is '
                    'used to build words. This option cannot be '
                    'specified together with --write-changes.')
    add_option('-s', '--summary',
               action='store_true', default=False,
               help='print summary of fixes')

    add_option('-S', '--skip',
               action='append',
               help='Comma-separated list of files to skip. It '
                    'accepts globs as well. E.g.: if you want '
                    'codespell to skip .eps and .txt files, '
                    'you\'d give "*.eps,*.txt" to this option.')

    add_option('-x', '--exclude-file', type=str, metavar='FILE',
               help='FILE with lines that should not be changed')

    add_option('-i', '--interactive',
               action='store', type=int, default=0,
               help='Set interactive mode when writing changes. '
                    '0: no interactivity. 1: ask for confirmation. '
                    '2 ask user to choose one fix when more than one '
                    'is available. 3: both 1 and 2')

    add_option('-q', '--quiet-level',
               action='store', type=int, default=0,
               help='Bitmask that allows codespell to run quietly. '
                    '0: the default, in which all messages are '
                    'printed. 1: disable warnings about wrong '
                    'encoding. 2: disable warnings about binary '
                    'file. 4: shut down warnings about automatic '
                    'fixes that were disabled in dictionary. '
                    '8: don\'t print anything for non-automatic '
                    'fixes. 16: don\'t print fixed files.')

    add_option('-e', '--hard-encoding-detection',
               action='store_true', default=False,
               help='Use chardet to detect the encoding of each '
                    'file. This can slow down codespell, but is more '
                    'reliable in detecting encodings other than '
                    'utf-8, iso8859-1, and ascii.')

    add_option('-f', '--check-filenames',
               action='store_true', default=False,
               help='check file names as well')

    add_option('-H', '--check-hidden',
               action='store_true', default=False,
               help='check hidden files (those starting with ".") as '
                    'well')
    add_option('-A', '--after-context', type=int, metavar='LINES',
               help='print LINES of trailing context')
    add_option('-B', '--before-context', type=int, metavar='LINES',
               help='print LINES of leading context')
    add_option('-C', '--context', type=int, metavar='LINES',
               help='print LINES of surrounding context')

    add_option('files', nargs='*',
               help='files or directories to check')

    options = parser.parse_args(list(args))

    # With nothing to check on the command line, fall back to the current
    # directory.
    if len(options.files) == 0:
        options.files.append('.')

    return options, parser
def build_exclude_hashes(filename, exclude_lines):
    """Add every line of ``filename`` to the set ``exclude_lines``.

    Lines are stored exactly as read, line terminator included.
    """
    with codecs.open(filename, 'r') as handle:
        exclude_lines.update(handle)
def build_ignore_words(filename, ignore_words):
    """Add each line of the UTF-8 file ``filename`` to ``ignore_words``.

    Surrounding whitespace is stripped from every line before it is added
    to the set.
    """
    with codecs.open(filename, mode='r', encoding='utf-8') as handle:
        ignore_words.update(entry.strip() for entry in handle)
def build_dict(filename, misspellings, ignore_words):
    """Load a dictionary file into ``misspellings``.

    Every non-blank line has the form ``wrong->data``. Both sides are
    lower-cased. The part after ``->`` decides how the entry is stored:

    * no comma: ``data`` is the fix and it may be applied automatically;
    * trailing comma: the comma is dropped, no automatic fix, no reason;
    * otherwise: the text after the last comma is the reason, the text
      before it is the data, and there is no automatic fix.

    Args:
        filename: path of a UTF-8 encoded dictionary file.
        misspellings: dict updated in place, mapping the lower-cased wrong
            word to a ``Misspelling``.
        ignore_words: container of words whose entries are skipped.

    Raises:
        ValueError: a non-blank line does not contain exactly one ``->``.
    """
    with codecs.open(filename, mode='r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a trailing newline in a hand-written custom
            # dictionary) carry no entry; skip them instead of failing on
            # the unpacking below.
            if not line.strip():
                continue
            [key, data] = line.split('->')
            # TODO for now, convert both to lower. Someday we can maybe add
            # support for fixing caps.
            key = key.lower()
            data = data.lower()
            if key in ignore_words:
                continue
            data = data.strip()
            fix = data.rfind(',')

            if fix < 0:
                fix = True
                reason = ''
            elif fix == (len(data) - 1):
                data = data[:fix]
                reason = ''
                fix = False
            else:
                reason = data[fix + 1:].strip()
                data = data[:fix]
                fix = False

            misspellings[key] = Misspelling(data, fix, reason)
def is_hidden(filename, check_hidden):
    """Tell whether ``filename`` should be skipped because it is hidden.

    A path counts as hidden when its last component starts with a dot,
    except for the empty name, ``.`` and ``..``. Nothing is reported as
    hidden when ``check_hidden`` is true.
    """
    base = os.path.basename(filename)
    if base in ('', '.', '..'):
        return False
    if check_hidden:
        return False
    return base[0] == '.'
def is_text_file(filename):
    """Guess whether ``filename`` holds text.

    Only the first 1024 bytes are inspected; the file is treated as text
    unless they contain a NUL byte.
    """
    with open(filename, mode='rb') as stream:
        head = stream.read(1024)
    return b'\x00' not in head
def fix_case(word, fixword):
    """Return ``fixword`` with the capitalisation pattern of ``word``.

    If ``word`` is unchanged by ``capitalize()`` the fix is capitalised;
    otherwise, if ``word`` is unchanged by ``upper()`` the fix is
    upper-cased. In every other case ``fixword`` is returned untouched.
    """
    # Order matters: the capitalised form is tested before the upper-case one.
    for transform in ('capitalize', 'upper'):
        if getattr(word, transform)() == word:
            return getattr(fixword, transform)()
    # Both are lower case, or the pattern is not one we recognise.
    return fixword
def ask_for_word_fix(line, wrongword, misspelling, interactivity):
    """Interactively decide whether and how ``wrongword`` should be fixed.

    Args:
        line: the text line containing the word; shown in the prompts.
        wrongword: the word as it appears in the text.
        misspelling: the entry for the word (``data``, ``fix``, ``reason``).
            It is updated in place with the user's decision.
        interactivity: bitmask. Bit 1 asks for confirmation of automatic
            fixes; bit 2 lets the user choose among the comma-separated
            candidates of an entry that has no reason. A value <= 0 asks
            nothing.

    Returns:
        ``(fix, fixword)``: whether to apply the fix, and the replacement
        with the case of ``wrongword`` applied.
    """
    if interactivity <= 0:
        return misspelling.fix, fix_case(wrongword, misspelling.data)

    if misspelling.fix and interactivity & 1:
        r = ''
        fixword = fix_case(wrongword, misspelling.data)
        while not r:
            print("%s\t%s ==> %s (Y/n) " % (line, wrongword, fixword), end='')
            # The prompt has no trailing newline, so push it out explicitly
            # before blocking on stdin (as the option menu below does).
            sys.stdout.flush()
            r = sys.stdin.readline().strip().upper()
            if not r:
                # An empty answer accepts the default, "yes".
                r = 'Y'
            if r != 'Y' and r != 'N':
                print("Say 'y' or 'n'")
                r = ''

        if r == 'N':
            misspelling.fix = False
            # Kept for backward compatibility; nothing in the visible code
            # reads this attribute.
            misspelling.fixword = ''

    elif (interactivity & 2) and not misspelling.reason:
        # if it is not disabled, i.e. it just has more than one possible fix,
        # we ask the user which word to use
        r = ''
        opt = [candidate.strip() for candidate in misspelling.data.split(',')]
        while not r:
            print("%s Choose an option (blank for none): " % line, end='')
            for i, candidate in enumerate(opt):
                fixword = fix_case(wrongword, candidate)
                print(" %d) %s" % (i, fixword), end='')
            print(": ", end='')
            sys.stdout.flush()

            n = sys.stdin.readline().strip()
            if not n:
                break

            try:
                n = int(n)
                # Only the indexes shown in the menu are valid; a negative
                # number would otherwise silently index from the end.
                if n < 0:
                    raise IndexError(n)
                r = opt[n]
            except (ValueError, IndexError):
                print("Not a valid option\n")

        if r:
            misspelling.fix = True
            misspelling.data = r

    return misspelling.fix, fix_case(wrongword, misspelling.data)
def print_context(lines, index, context):
    """Print the line at ``index`` together with its surrounding lines.

    ``context`` is a ``(lines_before, lines_after)`` pair. The line at
    ``index`` is prefixed with ``>``, the others with ``:``; trailing
    whitespace is removed and positions outside ``lines`` are skipped.
    """
    before, after = context[0], context[1]
    first = max(0, index - before)
    stop = min(len(lines), index + after + 1)
    for position in range(first, stop):
        if position == index:
            marker = '>'
        else:
            marker = ':'
        print('%s %s' % (marker, lines[position].rstrip()))
def parse_file(filename, colors, summary, misspellings, exclude_lines,
               file_opener, word_regex, context, options):
    """Check one file (or stdin) and report or fix its misspellings.

    Args:
        filename: path of the file to check, or ``'-'`` to read stdin.
        colors: object providing the ``FILE``, ``WWORD``, ``FWORD`` and
            ``DISABLE`` escape strings.
        summary: object with an ``update(word)`` method, or ``None``.
        misspellings: dict mapping lower-cased words to entries exposing
            ``data``, ``fix`` and ``reason``.
        exclude_lines: container of lines that must be left alone.
        file_opener: object whose ``open(filename)`` returns
            ``(lines, encoding)``.
        word_regex: compiled pattern used to extract words.
        context: ``(before, after)`` line counts, or ``None`` for none.
        options: parsed options; ``check_filenames``, ``quiet_level``,
            ``interactive`` and ``write_changes`` are read.

    Returns:
        The number of misspellings that were reported but not fixed,
        including those found in the file name.
    """
    bad_count = 0
    lines = None
    changed = False
    encoding = encodings[0]  # if not defined, use UTF-8

    if filename == '-':
        f = sys.stdin
        lines = f.readlines()
    else:
        # ignore anything that is not a regular file
        if not os.path.isfile(filename):
            return bad_count

        if options.check_filenames:
            for word in word_regex.findall(filename):
                lword = word.lower()
                if lword not in misspellings:
                    continue
                misspelling = misspellings[lword]
                fix = misspelling.fix
                fixword = fix_case(word, misspelling.data)

                if summary and fix:
                    summary.update(lword)

                cfilename = "%s%s%s" % (colors.FILE, filename, colors.DISABLE)
                cwrongword = "%s%s%s" % (colors.WWORD, word, colors.DISABLE)
                crightword = "%s%s%s" % (colors.FWORD, fixword, colors.DISABLE)

                if misspelling.reason:
                    if options.quiet_level & QuietLevels.DISABLED_FIXES:
                        continue
                    creason = "  | %s%s%s" % (colors.FILE,
                                              misspelling.reason,
                                              colors.DISABLE)
                else:
                    if options.quiet_level & QuietLevels.NON_AUTOMATIC_FIXES:
                        continue
                    creason = ''

                bad_count += 1

                print("%(FILENAME)s: %(WRONGWORD)s "
                      " ==> %(RIGHTWORD)s%(REASON)s"
                      % {'FILENAME': cfilename,
                         'WRONGWORD': cwrongword,
                         'RIGHTWORD': crightword, 'REASON': creason})

        # ignore binary files
        text = is_text_file(filename)
        if not text:
            if not options.quiet_level & QuietLevels.BINARY_FILE:
                print("WARNING: Binary file: %s " % filename, file=sys.stderr)
            # Misspellings already reported for the file name still count.
            return bad_count
        try:
            lines, encoding = file_opener.open(filename)
        except Exception:
            # Best effort: an unreadable file is skipped, but file-name
            # misspellings reported above are not forgotten.
            return bad_count

    for i, line in enumerate(lines):
        if line in exclude_lines:
            continue

        fixed_words = set()
        asked_for = set()

        for word in word_regex.findall(line):
            lword = word.lower()
            if lword not in misspellings:
                continue
            misspelling = misspellings[lword]

            context_shown = False
            fix = misspelling.fix
            fixword = fix_case(word, misspelling.data)

            # Ask at most once per word and line.
            if options.interactive and lword not in asked_for:
                if context is not None:
                    context_shown = True
                    print_context(lines, i, context)
                fix, fixword = ask_for_word_fix(
                    lines[i], word, misspelling,
                    options.interactive)
                asked_for.add(lword)

            if summary and fix:
                summary.update(lword)

            if word in fixed_words:  # can skip because of re.sub below
                continue

            if options.write_changes and fix:
                changed = True
                # Escape the word so that it is always matched literally,
                # whatever characters the word pattern let through.
                lines[i] = re.sub(r'\b%s\b' % re.escape(word), fixword,
                                  lines[i])
                fixed_words.add(word)
                continue

            # otherwise warning was explicitly set by interactive mode
            if (options.interactive & 2 and not fix and not
                    misspelling.reason):
                continue

            cfilename = "%s%s%s" % (colors.FILE, filename, colors.DISABLE)
            cline = "%s%d%s" % (colors.FILE, i + 1, colors.DISABLE)
            cwrongword = "%s%s%s" % (colors.WWORD, word, colors.DISABLE)
            crightword = "%s%s%s" % (colors.FWORD, fixword, colors.DISABLE)

            if misspelling.reason:
                if options.quiet_level & QuietLevels.DISABLED_FIXES:
                    continue

                creason = "  | %s%s%s" % (colors.FILE,
                                          misspelling.reason,
                                          colors.DISABLE)
            else:
                if options.quiet_level & QuietLevels.NON_AUTOMATIC_FIXES:
                    continue

                creason = ''

            # If we get to this point (uncorrected error) we should change
            # our bad_count and thus return value
            bad_count += 1

            if (not context_shown) and (context is not None):
                print_context(lines, i, context)
            if filename != '-':
                print("%(FILENAME)s:%(LINE)s: %(WRONGWORD)s "
                      " ==> %(RIGHTWORD)s%(REASON)s"
                      % {'FILENAME': cfilename, 'LINE': cline,
                         'WRONGWORD': cwrongword,
                         'RIGHTWORD': crightword, 'REASON': creason})
            else:
                print('%(LINE)s: %(STRLINE)s\n\t%(WRONGWORD)s '
                      '==> %(RIGHTWORD)s%(REASON)s'
                      % {'LINE': cline, 'STRLINE': line.strip(),
                         'WRONGWORD': cwrongword,
                         'RIGHTWORD': crightword, 'REASON': creason})

    if changed:
        if filename == '-':
            # stdin cannot be rewritten: print the corrected text instead.
            print("---")
            for line in lines:
                print(line, end='')
        else:
            if not options.quiet_level & QuietLevels.FIXES:
                print("%sFIXED:%s %s"
                      % (colors.FWORD, colors.DISABLE, filename),
                      file=sys.stderr)
            # Write back with the encoding the file was read with.
            with codecs.open(filename, 'w', encoding=encoding) as f:
                f.writelines(lines)
    return bad_count
def _script_main():
    """Console-script entry point: run main() on the command-line arguments."""
    cli_args = sys.argv[1:]  # drop the program name
    return main(*cli_args)
def main(*args):
    """Run codespell with the given command-line arguments.

    Args:
        *args: the argument strings, without the program name.

    Returns:
        ``1`` when the arguments are invalid (unusable option combination,
        bad regular expression, missing or unknown dictionary or ignore
        file); otherwise the number of misspellings that were reported but
        not fixed.
    """
    options, parser = parse_options(args)

    if options.regex and options.write_changes:
        print('ERROR: --write-changes cannot be used together with '
              '--regex', file=sys.stderr)
        parser.print_help()
        return 1
    word_regex = options.regex or word_regex_def
    try:
        word_regex = re.compile(word_regex)
    except re.error as err:
        print('ERROR: invalid regular expression "%s" (%s)' %
              (word_regex, err), file=sys.stderr)
        parser.print_help()
        return 1

    # Collect the words to ignore, from files first and then from lists.
    ignore_words_files = options.ignore_words or []
    ignore_words = set()
    for ignore_words_file in ignore_words_files:
        if not os.path.isfile(ignore_words_file):
            print('ERROR: cannot find ignore-words file: %s' %
                  ignore_words_file, file=sys.stderr)
            parser.print_help()
            return 1
        build_ignore_words(ignore_words_file, ignore_words)

    ignore_words_list = options.ignore_words_list or []
    for comma_separated_words in ignore_words_list:
        for word in comma_separated_words.split(','):
            ignore_words.add(word.strip())

    # Resolve the dictionaries to load; "-" stands for the builtin ones.
    if options.dictionary:
        dictionaries = options.dictionary
    else:
        dictionaries = ['-']
    use_dictionaries = list()
    for dictionary in dictionaries:
        if dictionary == "-":
            # figure out which builtin dictionaries to use
            use = sorted(set(options.builtin.split(',')))
            for u in use:
                for builtin in _builtin_dictionaries:
                    if builtin[0] == u:
                        use_dictionaries.append(
                            os.path.join(_data_root, 'dictionary%s.txt'
                                         % (builtin[2],)))
                        break
                else:
                    # The inner loop found no builtin with this name.
                    print('ERROR: Unknown builtin dictionary: %s' % (u,),
                          file=sys.stderr)
                    parser.print_help()
                    return 1
        else:
            if not os.path.isfile(dictionary):
                print('ERROR: cannot find dictionary file: %s' % dictionary,
                      file=sys.stderr)
                parser.print_help()
                return 1
            use_dictionaries.append(dictionary)
    misspellings = dict()
    for dictionary in use_dictionaries:
        build_dict(dictionary, misspellings, ignore_words)

    colors = TermColors()
    if not options.colors or sys.platform == 'win32':
        colors.disable()

    if options.summary:
        summary = Summary()
    else:
        summary = None

    # Work out the (before, after) context line counts; negative values are
    # clamped to zero.
    context = None
    if options.context is not None:
        if (options.before_context is not None) or \
                (options.after_context is not None):
            print('ERROR: --context/-C cannot be used together with '
                  '--before-context/-B or --after-context/-A',
                  file=sys.stderr)
            parser.print_help()
            return 1
        context_both = max(0, options.context)
        context = (context_both, context_both)
    elif (options.before_context is not None) or \
            (options.after_context is not None):
        context_before = 0
        context_after = 0
        if options.before_context is not None:
            context_before = max(0, options.before_context)
        if options.after_context is not None:
            context_after = max(0, options.after_context)
        context = (context_before, context_after)

    exclude_lines = set()
    if options.exclude_file:
        build_exclude_hashes(options.exclude_file, exclude_lines)

    file_opener = FileOpener(options.hard_encoding_detection,
                             options.quiet_level)
    glob_match = GlobMatch(options.skip)

    bad_count = 0
    for filename in options.files:
        # ignore hidden files
        # NOTE(review): only the paths given on the command line are tested
        # here; entries met while walking a directory are not.
        if is_hidden(filename, options.check_hidden):
            continue

        if os.path.isdir(filename):
            for root, dirs, files in os.walk(filename):
                if glob_match.match(root):  # skip (absolute) directories
                    # Emptying dirs in place stops os.walk from descending.
                    del dirs[:]
                    continue
                for file_ in files:
                    if glob_match.match(file_):  # skip files
                        continue
                    fname = os.path.join(root, file_)
                    if glob_match.match(fname):  # skip paths
                        continue
                    # Skip non-regular and empty files.
                    if not os.path.isfile(fname) or not os.path.getsize(fname):
                        continue
                    bad_count += parse_file(
                        fname, colors, summary, misspellings, exclude_lines,
                        file_opener, word_regex, context, options)

                # skip (relative) directories
                dirs[:] = [dir_ for dir_ in dirs if not glob_match.match(dir_)]

        else:
            bad_count += parse_file(
                filename, colors, summary, misspellings, exclude_lines,
                file_opener, word_regex, context, options)

    if summary:
        print("\n-------8<-------\nSUMMARY:")
        print(summary)
    return bad_count