Files
codespell/codespell_lib/_codespell.py
Philipp Hahn b91f7c76b4 Skip directories early
The current code already allows skipping directories, but the full path
must be given, e.g. "./.git" instead of just ".git", as "codespell ." is
the default.

As the name of the directory is constant for all files within, move the
check for the directory before the loop.

Also skip descending.
2017-08-17 16:20:48 +02:00

600 lines
20 KiB
Python
Executable File

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see
# http://www.gnu.org/licenses/old-licenses/gpl-2.0.html.
"""
Copyright (C) 2010-2011 Lucas De Marchi <lucas.de.marchi@gmail.com>
Copyright (C) 2011 ProFUSION embedded systems
"""
from __future__ import print_function
import codecs
import sys
import re
from optparse import OptionParser
import os
import fnmatch
USAGE = """
\t%prog [OPTIONS] [file1 file2 ... fileN]
"""
VERSION = '1.11.0.dev0'
misspellings = {}
ignore_words = set()
exclude_lines = set()
options = None
file_opener = None
quiet_level = 0
encodings = ['utf-8', 'iso-8859-1']
word_regex = re.compile(r"[\w\-']+")
# Users might want to link this file into /usr/local/bin, so we resolve the
# symbolic link path to the real path if necessary.
default_dictionary = os.path.join(os.path.dirname(os.path.realpath(__file__)),
'data', 'dictionary.txt')
# OPTIONS:
#
# ARGUMENTS:
# dict_filename The file containing the dictionary of misspellings.
# If set to '-', it will be read from stdin
# file1 .. fileN Files to check spelling
class QuietLevels(object):
NONE = 0
ENCODING = 1
BINARY_FILE = 2
DISABLED_FIXES = 4
NON_AUTOMATIC_FIXES = 8
FIXES = 16
class GlobMatch(object):
def __init__(self, pattern):
if pattern:
self.pattern_list = pattern.split(',')
else:
self.pattern_list = None
def match(self, filename):
if self.pattern_list is None:
return False
for p in self.pattern_list:
if fnmatch.fnmatch(filename, p):
return True
return False
class Misspelling(object):
def __init__(self, data, fix, reason):
self.data = data
self.fix = fix
self.reason = reason
class TermColors(object):
def __init__(self):
self.FILE = '\033[33m'
self.WWORD = '\033[31m'
self.FWORD = '\033[32m'
self.DISABLE = '\033[0m'
def disable(self):
self.FILE = ''
self.WWORD = ''
self.FWORD = ''
self.DISABLE = ''
class Summary(object):
def __init__(self):
self.summary = {}
def update(self, wrongword):
if wrongword in self.summary:
self.summary[wrongword] += 1
else:
self.summary[wrongword] = 1
def __str__(self):
keys = list(self.summary.keys())
keys.sort()
return "\n".join(["{0}{1:{width}}".format(
key,
self.summary.get(key),
width=15 - len(key)) for key in keys])
class FileOpener(object):
def __init__(self, use_chardet):
self.use_chardet = use_chardet
if use_chardet:
self.init_chardet()
def init_chardet(self):
try:
from chardet.universaldetector import UniversalDetector
except ImportError:
raise ImportError("There's no chardet installed to import from. "
"Please, install it and check your PYTHONPATH "
"environment variable")
self.encdetector = UniversalDetector()
def open(self, filename):
if self.use_chardet:
return self.open_with_chardet(filename)
else:
return self.open_with_internal(filename)
def open_with_chardet(self, filename):
self.encdetector.reset()
with codecs.open(filename, 'rb') as f:
for line in f:
self.encdetector.feed(line)
if self.encdetector.done:
break
self.encdetector.close()
encoding = self.encdetector.result['encoding']
try:
f = codecs.open(filename, 'r', encoding=encoding)
except UnicodeDecodeError:
print('ERROR: Could not detect encoding: %s' % filename,
file=sys.stderr)
raise
except LookupError:
print('ERROR: %s -- Don\'t know how to handle encoding %s'
% (filename, encoding), file=sys.stderr)
raise
else:
lines = f.readlines()
f.close()
return lines, encoding
def open_with_internal(self, filename):
curr = 0
global encodings
while True:
try:
f = codecs.open(filename, 'r', encoding=encodings[curr])
except UnicodeDecodeError:
if not quiet_level & QuietLevels.ENCODING:
print('WARNING: Decoding file %s' % filename,
file=sys.stderr)
print('WARNING: using encoding=%s failed. '
% encodings[curr], file=sys.stderr)
try:
print('WARNING: Trying next encoding: %s'
% encodings[curr + 1], file=sys.stderr)
except IndexError:
pass
curr += 1
else:
lines = f.readlines()
f.close()
break
if not lines:
raise Exception('Unknown encoding')
encoding = encodings[curr]
return lines, encoding
# -.-:-.-:-.-:-.:-.-:-.-:-.-:-.-:-.:-.-:-.-:-.-:-.-:-.:-.-:-
def parse_options(args):
parser = OptionParser(usage=USAGE, version=VERSION)
parser.set_defaults(colors=sys.stdout.isatty())
parser.add_option('-d', '--disable-colors',
action='store_false', dest='colors',
help='disable colors even when printing to terminal')
parser.add_option('-c', '--enable-colors',
action='store_true', dest='colors',
help='enable colors even when not printing to terminal')
parser.add_option('-w', '--write-changes',
action='store_true', default=False,
help='write changes in place if possible')
parser.add_option('-D', '--dictionary',
action='append', metavar='FILE',
help='Custom dictionary file that contains spelling '
'corrections. If this flag is not specified or '
'equals "-" then default dictionary "%s" is used. '
'This option can be specified multiple times.' %
default_dictionary)
parser.add_option('-I', '--ignore-words',
action='append', metavar='FILE',
help='File that contains words which will be ignored '
'by codespell.')
parser.add_option('-r', '--regex',
action='store', type='string',
help='Regular expression which is used to find words. '
'By default any alphanumeric character, the '
'underscore, the hyphen, and the apostrophe is '
'used to build words (i.e. %s). This option cannot '
'be specified together with the write-changes '
'functionality. ' % word_regex.pattern)
parser.add_option('-s', '--summary',
action='store_true', default=False,
help='print summary of fixes')
parser.add_option('-S', '--skip',
help='Comma-separated list of files to skip. It '
'accepts globs as well. E.g.: if you want '
'codespell to skip .eps and .txt files, '
'you\'d give "*.eps,*.txt" to this option.')
parser.add_option('-x', '--exclude-file',
help='FILE with lines that should not be changed',
metavar='FILE')
parser.add_option('-i', '--interactive',
action='store', type='int', default=0,
help='Set interactive mode when writing changes. '
'0 is the same of no interactivity; 1 makes '
'codespell ask confirmation; 2 ask user to '
'choose one fix when more than one is '
'available; 3 applies both 1 and 2')
parser.add_option('-q', '--quiet-level',
action='store', type='int', default=0,
help='Bitmask that allows codespell to run quietly. '
'0: the default, in which all messages are '
'printed. 1: disable warnings about wrong '
'encoding. 2: disable warnings about binary '
'file. 4: shut down warnings about automatic '
'fixes that were disabled in dictionary. '
'8: don\'t print anything for non-automatic '
'fixes. 16: don\'t print fixed files.')
parser.add_option('-e', '--hard-encoding-detection',
action='store_true', default=False,
help='Use chardet to detect the encoding of each '
'file. This can slow down codespell, but is more '
'reliable in detecting encodings other than utf-8, '
'iso8859-1 and ascii.')
(o, args) = parser.parse_args(list(args))
if not args:
args.append('.')
return o, args, parser
def build_exclude_hashes(filename):
with codecs.open(filename, 'r') as f:
for line in f:
exclude_lines.add(line)
def build_ignore_words(filename):
with codecs.open(filename, mode='r', buffering=1, encoding='utf-8') as f:
for line in f:
ignore_words.add(line.strip())
def build_dict(filename):
with codecs.open(filename, mode='r', buffering=1, encoding='utf-8') as f:
for line in f:
[key, data] = line.split('->')
if key in ignore_words:
continue
data = data.strip()
fix = data.rfind(',')
if fix < 0:
fix = True
reason = ''
elif fix == (len(data) - 1):
data = data[:fix]
reason = ''
fix = False
else:
reason = data[fix + 1:].strip()
data = data[:fix]
fix = False
misspellings[key] = Misspelling(data, fix, reason)
def is_hidden(filename):
bfilename = os.path.basename(filename)
if bfilename != '' and bfilename != '.' and bfilename != '..' \
and bfilename[0] == '.':
return True
return False
def is_text_file(filename):
with open(filename, mode='rb') as f:
s = f.read(1024)
if b'\x00' in s:
return False
return True
def fix_case(word, fixword):
if word == word.capitalize():
return fixword.capitalize()
elif word == word.upper():
return fixword.upper()
# they are both lower case
# or we don't have any idea
return fixword
def ask_for_word_fix(line, wrongword, misspelling, interactivity):
if interactivity <= 0:
return misspelling.fix, fix_case(wrongword, misspelling.data)
if misspelling.fix and interactivity & 1:
r = ''
fixword = fix_case(wrongword, misspelling.data)
while not r:
print("%s\t%s ==> %s (Y/n) " % (line, wrongword, fixword), end='')
r = sys.stdin.readline().strip().upper()
if not r:
r = 'Y'
if r != 'Y' and r != 'N':
print("Say 'y' or 'n'")
r = ''
if r == 'N':
misspelling.fix = False
misspelling.fixword = ''
elif (interactivity & 2) and not misspelling.reason:
# if it is not disabled, i.e. it just has more than one possible fix,
# we ask the user which word to use
r = ''
opt = list(map(lambda x: x.strip(), misspelling.data.split(',')))
while not r:
print("%s Choose an option (blank for none): " % line, end='')
for i in range(len(opt)):
fixword = fix_case(wrongword, opt[i])
print(" %d) %s" % (i, fixword), end='')
print(": ", end='')
sys.stdout.flush()
n = sys.stdin.readline().strip()
if not n:
break
try:
n = int(n)
r = opt[n]
except (ValueError, IndexError):
print("Not a valid option\n")
if r:
misspelling.fix = True
misspelling.data = r
return misspelling.fix, fix_case(wrongword, misspelling.data)
def parse_file(filename, colors, summary):
lines = None
changed = False
global misspellings
global options
global encodings
global quiet_level
encoding = encodings[0] # if not defined, use UTF-8
if filename == '-':
f = sys.stdin
lines = f.readlines()
else:
# ignore binary files
if not os.path.isfile(filename):
return 0
text = is_text_file(filename)
if not text:
if not quiet_level & QuietLevels.BINARY_FILE:
print("WARNING: Binary file: %s " % filename, file=sys.stderr)
return 0
try:
lines, encoding = file_opener.open(filename)
except Exception:
return 0
bad_count = 0
for i, line in enumerate(lines):
if line in exclude_lines:
continue
fixed_words = set()
asked_for = set()
for word in word_regex.findall(line):
lword = word.lower()
if lword in misspellings:
fix = misspellings[lword].fix
fixword = fix_case(word, misspellings[lword].data)
if options.interactive and lword not in asked_for:
fix, fixword = ask_for_word_fix(lines[i], word,
misspellings[lword],
options.interactive)
asked_for.add(lword)
if summary and fix:
summary.update(lword)
if word in fixed_words: # can skip because of re.sub below
continue
if options.write_changes and fix:
changed = True
lines[i] = re.sub(r'\b%s\b' % word, fixword, lines[i])
fixed_words.add(word)
continue
# otherwise warning was explicitly set by interactive mode
if (options.interactive & 2 and not fix and not
misspellings[lword].reason):
continue
cfilename = "%s%s%s" % (colors.FILE, filename, colors.DISABLE)
cline = "%s%d%s" % (colors.FILE, i + 1, colors.DISABLE)
cwrongword = "%s%s%s" % (colors.WWORD, word, colors.DISABLE)
crightword = "%s%s%s" % (colors.FWORD, fixword, colors.DISABLE)
if misspellings[lword].reason:
if quiet_level & QuietLevels.DISABLED_FIXES:
continue
creason = " | %s%s%s" % (colors.FILE,
misspellings[lword].reason,
colors.DISABLE)
else:
if quiet_level & QuietLevels.NON_AUTOMATIC_FIXES:
continue
creason = ''
# If we get to this point (uncorrected error) we should change
# our bad_count and thus return value
bad_count += 1
if filename != '-':
print("%(FILENAME)s:%(LINE)s: %(WRONGWORD)s "
" ==> %(RIGHTWORD)s%(REASON)s"
% {'FILENAME': cfilename, 'LINE': cline,
'WRONGWORD': cwrongword,
'RIGHTWORD': crightword, 'REASON': creason})
else:
print('%(LINE)s: %(STRLINE)s\n\t%(WRONGWORD)s '
'==> %(RIGHTWORD)s%(REASON)s'
% {'LINE': cline, 'STRLINE': line.strip(),
'WRONGWORD': cwrongword,
'RIGHTWORD': crightword, 'REASON': creason})
if changed:
if filename == '-':
print("---")
for line in lines:
print(line, end='')
else:
if not quiet_level & QuietLevels.FIXES:
print("%sFIXED:%s %s"
% (colors.FWORD, colors.DISABLE, filename),
file=sys.stderr)
with codecs.open(filename, 'w', encoding=encoding) as f:
f.writelines(lines)
return bad_count
def main(*args):
"""Contains flow control"""
global options
global quiet_level
global file_opener
options, args, parser = parse_options(args)
if options.regex:
if options.write_changes:
parser.error('--write-changes cannot be used together with '
'--regex')
global word_regex
try:
word_regex = re.compile(options.regex)
except re.error as err:
print('ERROR: invalid regular expression "%s" (%s)' %
(options.regex, err), file=sys.stderr)
parser.print_help()
return 1
ignore_words_files = options.ignore_words or []
for ignore_words_file in ignore_words_files:
if not os.path.exists(ignore_words_file):
print('ERROR: cannot find ignore-words file: %s' %
ignore_words_file, file=sys.stderr)
parser.print_help()
return 1
build_ignore_words(ignore_words_file)
dictionaries = options.dictionary or [default_dictionary]
for dictionary in dictionaries:
if dictionary is "-":
dictionary = default_dictionary
if not os.path.exists(dictionary):
print('ERROR: cannot find dictionary file: %s' % dictionary,
file=sys.stderr)
parser.print_help()
return 1
build_dict(dictionary)
colors = TermColors()
if not options.colors:
colors.disable()
if options.summary:
summary = Summary()
else:
summary = None
if options.exclude_file:
build_exclude_hashes(options.exclude_file)
if options.quiet_level:
quiet_level = options.quiet_level
file_opener = FileOpener(options.hard_encoding_detection)
glob_match = GlobMatch(options.skip)
bad_count = 0
for filename in args:
# ignore hidden files
if is_hidden(filename):
continue
if os.path.isdir(filename):
for root, dirs, files in os.walk(filename):
if glob_match.match(root): # skip (absolute) directories
del dirs[:]
continue
for file_ in files:
fname = os.path.join(root, file_)
if not os.path.isfile(fname) or not os.path.getsize(fname):
continue
if glob_match.match(file_):
continue
bad_count += parse_file(fname, colors, summary)
else:
bad_count += parse_file(filename, colors, summary)
if summary:
print("\n-------8<-------\nSUMMARY:")
print(summary)
return bad_count