#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see
# http://www.gnu.org/licenses/old-licenses/gpl-2.0.html.
"""
Copyright (C) 2010-2011  Lucas De Marchi
Copyright (C) 2011  ProFUSION embedded systems
"""

from __future__ import print_function

import codecs
import sys
import re
from optparse import OptionParser
import os
import fnmatch

USAGE = """ \t%prog [OPTIONS] [file1 file2 ... fileN] """
VERSION = '1.10.0.dev0'

# Module-wide state, filled in by main() and the build_* helpers.
misspellings = {}        # lower-case wrong word -> Misspelling
exclude_lines = set()    # lines that must never be reported or changed
options = None           # optparse values object, set by main()
file_opener = None       # FileOpener instance, set by main()
quiet_level = 0          # bitmask of QuietLevels, set by main()
encodings = ['utf-8', 'iso-8859-1']  # tried in order by FileOpener
# Users might want to link this file into /usr/local/bin, so we resolve the
# symbolic link path to the real path if necessary.
default_dictionary = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                  'data', 'dictionary.txt')

# Pattern used by parse_file() to split a line into candidate words.
_WORD_RE = re.compile(r"[\w\-']+")

# OPTIONS:
#    See parse_options(). The dictionary is selected with -D/--dictionary;
#    a value of '-' selects the default dictionary.
#
# ARGUMENTS:
#    file1 .. fileN      Files or directories to check spelling. A file
#                        name of '-' makes the text be read from stdin.
#                        When no argument is given, '.' is checked.


class QuietLevels(object):
    """Bit flags accepted by the -q/--quiet-level option."""
    NONE = 0
    ENCODING = 1
    BINARY_FILE = 2
    DISABLED_FIXES = 4
    NON_AUTOMATIC_FIXES = 8
    FIXES = 16


class GlobMatch(object):
    """Match names against a comma-separated list of fnmatch globs."""

    def __init__(self, pattern):
        """Split ``pattern`` on commas; a falsy pattern matches nothing."""
        if pattern:
            self.pattern_list = pattern.split(',')
        else:
            self.pattern_list = None

    def match(self, filename):
        """Return True when ``filename`` matches any configured glob."""
        if self.pattern_list is None:
            return False
        return any(fnmatch.fnmatch(filename, p) for p in self.pattern_list)


class Misspelling(object):
    """One dictionary entry: replacement text, auto-fix flag and reason."""

    def __init__(self, data, fix, reason):
        self.data = data      # replacement word, or comma-separated choices
        self.fix = fix        # True when the fix may be applied automatically
        self.reason = reason  # text explaining why the fix is disabled


class TermColors(object):
    """ANSI escape sequences used to colorize the report."""

    def __init__(self):
        self.FILE = '\033[33m'
        self.WWORD = '\033[31m'
        self.FWORD = '\033[32m'
        self.DISABLE = '\033[0m'

    def disable(self):
        """Turn every color into an empty string (plain output)."""
        self.FILE = ''
        self.WWORD = ''
        self.FWORD = ''
        self.DISABLE = ''


class Summary(object):
    """Count how many times each wrong word was seen."""

    def __init__(self):
        self.summary = {}

    def update(self, wrongword):
        """Increment the counter kept for ``wrongword``."""
        self.summary[wrongword] = self.summary.get(wrongword, 0) + 1

    def __str__(self):
        """Return one "word count" row per word, sorted by word.

        The count is right-aligned so that a row is 15 columns wide. For
        long words the width is widened so that at least one space always
        separates the word from its count.
        """
        rows = []
        for key in sorted(self.summary):
            count = self.summary[key]
            width = max(15 - len(key), len(str(count)) + 1)
            rows.append("{0}{1:{width}}".format(key, count, width=width))
        return "\n".join(rows)


class FileOpener(object):
    """Read a text file into a list of lines, working out its encoding."""

    def __init__(self, use_chardet):
        """Use chardet for detection when ``use_chardet`` is true."""
        self.use_chardet = use_chardet
        if use_chardet:
            self.init_chardet()

    def init_chardet(self):
        """Create the chardet detector.

        Raises ImportError with an explanatory message when chardet is not
        importable.
        """
        try:
            from chardet.universaldetector import UniversalDetector
        except ImportError:
            raise ImportError("There's no chardet installed to import from. "
                              "Please, install it and check your PYTHONPATH "
                              "environment variable")

        self.encdetector = UniversalDetector()

    def open(self, filename):
        """Return ``(lines, encoding)`` for ``filename``."""
        if self.use_chardet:
            return self.open_with_chardet(filename)
        else:
            return self.open_with_internal(filename)

    def open_with_chardet(self, filename):
        """Detect the encoding with chardet, then decode the whole file.

        An error message is printed to stderr and the exception re-raised
        when decoding fails (UnicodeDecodeError) or the detected encoding
        is not known to Python (LookupError).
        """
        self.encdetector.reset()
        with codecs.open(filename, 'rb') as f:
            for line in f:
                self.encdetector.feed(line)
                if self.encdetector.done:
                    break
        self.encdetector.close()
        encoding = self.encdetector.result['encoding']

        try:
            # Decoding happens while reading, so the read has to be inside
            # the try block for UnicodeDecodeError to be reported here.
            with codecs.open(filename, 'r', encoding=encoding) as f:
                lines = f.readlines()
        except UnicodeDecodeError:
            print('ERROR: Could not detect encoding: %s' % filename,
                  file=sys.stderr)
            raise
        except LookupError:
            print('ERROR: %s -- Don\'t know how to handle encoding %s'
                  % (filename, encoding), file=sys.stderr)
            raise

        return lines, encoding

    def open_with_internal(self, filename):
        """Decode the file with each entry of ``encodings`` until one works.

        Warnings about failed attempts go to stderr unless silenced by
        QuietLevels.ENCODING. Raises Exception('Unknown encoding') when no
        encoding succeeds or when no lines were read.
        """
        lines = None
        encoding = None
        for curr, encoding in enumerate(encodings):
            try:
                # Decoding errors surface in readlines(), not in open().
                with codecs.open(filename, 'r', encoding=encoding) as f:
                    lines = f.readlines()
            except UnicodeDecodeError:
                lines = None
                if not quiet_level & QuietLevels.ENCODING:
                    print('WARNING: Decoding file %s' % filename,
                          file=sys.stderr)
                    print('WARNING: using encoding=%s failed. '
                          % encoding, file=sys.stderr)
                    if curr + 1 < len(encodings):
                        print('WARNING: Trying next encoding: %s'
                              % encodings[curr + 1], file=sys.stderr)
            else:
                break

        if not lines:
            raise Exception('Unknown encoding')

        return lines, encoding

# -.-:-.-:-.-:-.:-.-:-.-:-.-:-.-:-.:-.-:-.-:-.-:-.-:-.:-.-:-


def parse_options(args):
    """Parse the command line.

    Returns ``(options, args, parser)``; ``args`` holds the files to check
    and defaults to ``['.']`` when none were given.
    """
    parser = OptionParser(usage=USAGE, version=VERSION)

    parser.set_defaults(colors=sys.stdout.isatty())
    parser.add_option('-d', '--disable-colors',
                      action='store_false', dest='colors',
                      help='disable colors even when printing to terminal')
    parser.add_option('-c', '--enable-colors',
                      action='store_true', dest='colors',
                      help='enable colors even when not printing to terminal')
    parser.add_option('-w', '--write-changes',
                      action='store_true', default=False,
                      help='write changes in place if possible')
    parser.add_option('-D', '--dictionary',
                      action='append', metavar='FILE',
                      help='Custom dictionary file that contains spelling '
                           'corrections. If this flag is not specified or '
                           'equals "-" then default dictionary "%s" is used. '
                           'This option can be specified multiple times.' %
                      default_dictionary)

    parser.add_option('-s', '--summary',
                      action='store_true', default=False,
                      help='print summary of fixes')

    parser.add_option('-S', '--skip',
                      help='Comma-separated list of files to skip. It '
                           'accepts globs as well. E.g.: if you want '
                           'codespell to skip .eps and .txt files, '
                           'you\'d give "*.eps,*.txt" to this option.')

    parser.add_option('-x', '--exclude-file',
                      help='FILE with lines that should not be changed',
                      metavar='FILE')

    parser.add_option('-i', '--interactive',
                      action='store', type='int', default=0,
                      help='Set interactive mode when writing changes. '
                           '0 is the same of no interactivity; 1 makes '
                           'codespell ask confirmation; 2 ask user to '
                           'choose one fix when more than one is '
                           'available; 3 applies both 1 and 2')

    parser.add_option('-q', '--quiet-level',
                      action='store', type='int', default=0,
                      help='Bitmask that allows codespell to run quietly. '
                           '0: the default, in which all messages are '
                           'printed. 1: disable warnings about wrong '
                           'encoding. 2: disable warnings about binary '
                           'file. 4: shut down warnings about automatic '
                           'fixes that were disabled in dictionary. '
                           '8: don\'t print anything for non-automatic '
                           'fixes. 16: don\'t print fixed files.')

    parser.add_option('-e', '--hard-encoding-detection',
                      action='store_true', default=False,
                      help='Use chardet to detect the encoding of each '
                           'file. This can slow down codespell, but is more '
                           'reliable in detecting encodings other than utf-8, '
                           'iso8859-1 and ascii.')

    (o, args) = parser.parse_args(list(args))

    if not args:
        args.append('.')

    return o, args, parser


def build_exclude_hashes(filename):
    """Add every line of ``filename`` to the global ``exclude_lines`` set."""
    with codecs.open(filename, 'r') as f:
        for line in f:
            exclude_lines.add(line)


def build_dict(filename):
    """Load a ``wrong->right`` dictionary file into ``misspellings``.

    The text after the last comma of the right-hand side is the reason why
    the fix is disabled; a right-hand side that contains a comma is never
    applied automatically. Blank lines are ignored.
    """
    # No line buffering here: codecs.open() switches to binary mode when an
    # encoding is given, where buffering=1 is unsupported and only warns.
    with codecs.open(filename, mode='r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            [key, data] = line.split('->')
            data = data.strip()
            fix = data.rfind(',')

            if fix < 0:
                # single replacement: safe to apply automatically
                fix = True
                reason = ''
            elif fix == (len(data) - 1):
                # trailing comma: several candidates, no reason given
                data = data[:fix]
                reason = ''
                fix = False
            else:
                reason = data[fix + 1:].strip()
                data = data[:fix]
                fix = False

            misspellings[key] = Misspelling(data, fix, reason)


def is_hidden(filename):
    """Return True when the base name starts with a dot.

    The special names '.' and '..' (and an empty base name) are not
    considered hidden.
    """
    bfilename = os.path.basename(filename)
    return bfilename not in ('', '.', '..') and bfilename[0] == '.'


def is_text_file(filename):
    """Return False when the first 1024 bytes contain a NUL byte."""
    with open(filename, mode='rb') as f:
        s = f.read(1024)
    return b'\x00' not in s


def fix_case(word, fixword):
    """Return ``fixword`` using the capitalization style of ``word``."""
    if word == word.capitalize():
        return fixword.capitalize()
    elif word == word.upper():
        return fixword.upper()
    # they are both lower case
    # or we don't have any idea
    return fixword


def ask_for_word_fix(line, wrongword, misspelling, interactivity):
    """Ask the user how to handle ``wrongword`` found in ``line``.

    Bit 1 of ``interactivity`` asks for confirmation of automatic fixes,
    bit 2 lets the user choose among several candidates. The answer is
    stored on ``misspelling`` itself. Returns ``(fix, fixword)``.
    """
    if interactivity <= 0:
        return misspelling.fix, fix_case(wrongword, misspelling.data)

    if misspelling.fix and interactivity & 1:
        r = ''
        fixword = fix_case(wrongword, misspelling.data)
        while not r:
            print("%s\t%s ==> %s (Y/n) " % (line, wrongword, fixword), end='')
            # the prompt has no newline, so push it out before blocking
            sys.stdout.flush()
            r = sys.stdin.readline().strip().upper()
            if not r:
                r = 'Y'
            if r != 'Y' and r != 'N':
                print("Say 'y' or 'n'")
                r = ''

        if r == 'N':
            misspelling.fix = False
            misspelling.fixword = ''

    elif (interactivity & 2) and not misspelling.reason:
        # if it is not disabled, i.e. it just has more than one possible fix,
        # we ask the user which word to use

        r = ''
        opt = [candidate.strip() for candidate in misspelling.data.split(',')]
        while not r:
            print("%s Choose an option (blank for none): " % line, end='')
            for i, candidate in enumerate(opt):
                fixword = fix_case(wrongword, candidate)
                print(" %d) %s" % (i, fixword), end='')
            print(": ", end='')
            sys.stdout.flush()

            n = sys.stdin.readline().strip()
            if not n:
                break

            try:
                index = int(n)
                if index < 0:
                    # a negative number would silently count from the end
                    raise IndexError(index)
                r = opt[index]
            except (ValueError, IndexError):
                print("Not a valid option\n")

        if r:
            misspelling.fix = True
            misspelling.data = r

    return misspelling.fix, fix_case(wrongword, misspelling.data)


def parse_file(filename, colors, summary):
    """Check one file (or stdin when ``filename`` is '-').

    Misspellings are fixed in place when --write-changes is set and the
    fix is allowed; the others are printed, with 1-based line numbers.
    Returns the number of misspellings that were reported.
    """
    lines = None
    changed = False

    encoding = encodings[0]  # if not defined, use UTF-8

    if filename == '-':
        lines = sys.stdin.readlines()
    else:
        # ignore binary files
        if not os.path.isfile(filename):
            return 0
        if not is_text_file(filename):
            if not quiet_level & QuietLevels.BINARY_FILE:
                print("WARNING: Binary file: %s " % filename, file=sys.stderr)
            return 0
        try:
            lines, encoding = file_opener.open(filename)
        except Exception:
            # best effort: a file that cannot be read or decoded is skipped
            return 0

    bad_count = 0
    for i, line in enumerate(lines):
        if line in exclude_lines:
            continue

        fixed_words = set()
        asked_for = set()

        for word in _WORD_RE.findall(line):
            lword = word.lower()
            if lword not in misspellings:
                continue

            misspelling = misspellings[lword]
            fix = misspelling.fix
            fixword = fix_case(word, misspelling.data)

            if options.interactive and lword not in asked_for:
                fix, fixword = ask_for_word_fix(lines[i], word, misspelling,
                                                options.interactive)
                asked_for.add(lword)

            if summary and fix:
                summary.update(lword)

            if word in fixed_words:  # can skip because of re.sub below
                continue

            if options.write_changes and fix:
                changed = True
                lines[i] = re.sub(r'\b%s\b' % re.escape(word), fixword,
                                  lines[i])
                fixed_words.add(word)
                continue

            # otherwise warning was explicitly set by interactive mode
            if (options.interactive & 2 and not fix and
                    not misspelling.reason):
                continue

            cfilename = "%s%s%s" % (colors.FILE, filename, colors.DISABLE)
            # enumerate() counts from 0, reported line numbers start at 1
            cline = "%s%d%s" % (colors.FILE, i + 1, colors.DISABLE)
            cwrongword = "%s%s%s" % (colors.WWORD, word, colors.DISABLE)
            crightword = "%s%s%s" % (colors.FWORD, fixword, colors.DISABLE)

            if misspelling.reason:
                if quiet_level & QuietLevels.DISABLED_FIXES:
                    continue
                creason = " | %s%s%s" % (colors.FILE, misspelling.reason,
                                         colors.DISABLE)
            else:
                if quiet_level & QuietLevels.NON_AUTOMATIC_FIXES:
                    continue
                creason = ''

            # If we get to this point (uncorrected error) we should change
            # our bad_count and thus return value
            bad_count += 1

            if filename != '-':
                print("%(FILENAME)s:%(LINE)s: %(WRONGWORD)s "
                      " ==> %(RIGHTWORD)s%(REASON)s"
                      % {'FILENAME': cfilename, 'LINE': cline,
                         'WRONGWORD': cwrongword,
                         'RIGHTWORD': crightword, 'REASON': creason})
            else:
                print('%(LINE)s: %(STRLINE)s\n\t%(WRONGWORD)s '
                      '==> %(RIGHTWORD)s%(REASON)s'
                      % {'LINE': cline, 'STRLINE': line.strip(),
                         'WRONGWORD': cwrongword,
                         'RIGHTWORD': crightword, 'REASON': creason})

    if changed:
        if filename == '-':
            print("---")
            for line in lines:
                print(line, end='')
        else:
            if not quiet_level & QuietLevels.FIXES:
                print("%sFIXED:%s %s" % (colors.FWORD, colors.DISABLE,
                                         filename), file=sys.stderr)
            with codecs.open(filename, 'w', encoding=encoding) as f:
                f.writelines(lines)
    return bad_count


def main(*args):
    """Contains flow control.

    ``args`` are the command-line arguments (without the program name).
    Returns 1 when a dictionary file is missing, otherwise the number of
    misspellings that were reported.
    """
    global options
    global quiet_level
    global file_opener

    options, args, parser = parse_options(args)

    dictionaries = options.dictionary or [default_dictionary]
    for dictionary in dictionaries:
        if dictionary == "-":
            dictionary = default_dictionary
        if not os.path.exists(dictionary):
            print('ERROR: cannot find dictionary file: %s' % dictionary,
                  file=sys.stderr)
            parser.print_help()
            return 1
        build_dict(dictionary)

    colors = TermColors()
    if not options.colors:
        colors.disable()

    if options.summary:
        summary = Summary()
    else:
        summary = None

    if options.exclude_file:
        build_exclude_hashes(options.exclude_file)

    # assigned unconditionally so a previous call cannot leave a stale value
    quiet_level = options.quiet_level

    file_opener = FileOpener(options.hard_encoding_detection)
    glob_match = GlobMatch(options.skip)

    bad_count = 0
    for filename in args:
        # ignore hidden files
        if is_hidden(filename):
            continue

        if os.path.isdir(filename):
            for root, dirs, files in os.walk(filename):
                for file_ in files:
                    fname = os.path.join(root, file_)
                    if not os.path.isfile(fname) or not os.path.getsize(fname):
                        continue
                    if glob_match.match(root):  # skips also match directories
                        continue
                    if glob_match.match(file_):
                        continue
                    bad_count += parse_file(fname, colors, summary)
        else:
            bad_count += parse_file(filename, colors, summary)

    if summary:
        print("\n-------8<-------\nSUMMARY:")
        print(summary)
    return bad_count