From 1e7f5f7e6f463da5e04447ac8995e611a3bbc770 Mon Sep 17 00:00:00 2001 From: jonmeow <46229924+jonmeow@users.noreply.github.com> Date: Fri, 28 Aug 2020 14:19:52 -0700 Subject: [PATCH] Add support for ignoring spelling mistakes in URIs specifically. --- codespell_lib/_codespell.py | 54 +++++++++++--- codespell_lib/tests/test_basic.py | 115 +++++++++++++++++++++++++++++- 2 files changed, 155 insertions(+), 14 deletions(-) diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py index 83ebaa4e..2296ea94 100755 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -28,6 +28,7 @@ import sys import textwrap word_regex_def = u"[\\w\\-'’`]+" +uri_regex_def = u"\\b((?:https?|t?ftp|file|git|smb)://[^\\s'\"]*|[\\w.%+-]+@[\\w.-]+)\\b" encodings = ('utf-8', 'iso-8859-1') USAGE = """ \t%prog [OPTIONS] [file1 file2 ... fileN] @@ -279,7 +280,7 @@ def parse_options(args): 'patterns to ignore by treating as whitespace. ' 'When writing regexes, consider ensuring there ' 'are boundary non-word chars, e.g., ' - '"\\Wmatch\\W". Defaults to empty/disabled.') + '"\\bmatch\\b". Defaults to empty/disabled.') parser.add_argument('-I', '--ignore-words', action='append', metavar='FILE', help='file that contains words which will be ignored ' @@ -291,6 +292,13 @@ def parse_options(args): help='comma separated list of words to be ignored ' 'by codespell. Words are case sensitive based on ' 'how they are written in the dictionary file') + parser.add_argument('--uri-ignore-words-list', + action='append', metavar='WORDS', + help='comma separated list of words to be ignored ' + 'by codespell in URIs and emails only. Words are ' + 'case sensitive based on how they are written in ' + 'the dictionary file. If set to "*", all ' + 'misspelling in URIs and emails will be ignored.') parser.add_argument('-r', '--regex', action='store', type=str, help='regular expression which is used to find words. 
' @@ -373,6 +381,15 @@ def parse_options(args): return options, parser +def parse_ignore_words_option(ignore_words_option): + ignore_words = set() + if ignore_words_option: + for comma_separated_words in ignore_words_option: + for word in comma_separated_words.split(','): + ignore_words.add(word.strip()) + return ignore_words + + def build_exclude_hashes(filename, exclude_lines): with codecs.open(filename, 'r') as f: for line in f: @@ -502,8 +519,20 @@ def extract_words(text, word_regex, ignore_word_regex): return word_regex.findall(text) +def apply_uri_ignore_words(check_words, line, word_regex, ignore_word_regex, + uri_regex, uri_ignore_words): + if not uri_ignore_words: + return + for uri in re.findall(uri_regex, line): + for uri_word in extract_words(uri, word_regex, ignore_word_regex): + ignored = "*" in uri_ignore_words or uri_word in uri_ignore_words + if ignored and uri_word in check_words: # tokenization may differ + check_words.remove(uri_word) + + def parse_file(filename, colors, summary, misspellings, exclude_lines, - file_opener, word_regex, ignore_word_regex, context, options): + file_opener, word_regex, ignore_word_regex, uri_regex, + uri_ignore_words, context, options): bad_count = 0 lines = None changed = False @@ -568,7 +597,11 @@ def parse_file(filename, colors, summary, misspellings, exclude_lines, fixed_words = set() asked_for = set() - for word in extract_words(line, word_regex, ignore_word_regex): + check_words = extract_words(line, word_regex, ignore_word_regex) + apply_uri_ignore_words(check_words, line, word_regex, ignore_word_regex, + uri_regex, uri_ignore_words) + + for word in check_words: lword = word.lower() if lword in misspellings: context_shown = False @@ -688,7 +721,7 @@ def main(*args): ignore_word_regex = None ignore_words_files = options.ignore_words or [] - ignore_words = set() + ignore_words = parse_ignore_words_option(options.ignore_words_list) for ignore_words_file in ignore_words_files: if not os.path.isfile(ignore_words_file): print("ERROR: cannot find ignore-words file: %s" % @@ -697,10 
+730,8 @@ def main(*args): return EX_USAGE build_ignore_words(ignore_words_file, ignore_words) - ignore_words_list = options.ignore_words_list or [] - for comma_separated_words in ignore_words_list: - for word in comma_separated_words.split(','): - ignore_words.add(word.strip()) + uri_regex = re.compile(uri_regex_def) + uri_ignore_words = parse_ignore_words_option(options.uri_ignore_words_list) if options.dictionary: dictionaries = options.dictionary @@ -794,8 +825,8 @@ def main(*args): continue bad_count += parse_file( fname, colors, summary, misspellings, exclude_lines, - file_opener, word_regex, ignore_word_regex, context, - options) + file_opener, word_regex, ignore_word_regex, uri_regex, + uri_ignore_words, context, options) # skip (relative) directories dirs[:] = [dir_ for dir_ in dirs if not glob_match.match(dir_)] @@ -803,7 +834,8 @@ def main(*args): else: bad_count += parse_file( filename, colors, summary, misspellings, exclude_lines, - file_opener, word_regex, ignore_word_regex, context, options) + file_opener, word_regex, ignore_word_regex, uri_regex, + uri_ignore_words, context, options) if summary: print("\n-------8<-------\nSUMMARY:") diff --git a/codespell_lib/tests/test_basic.py b/codespell_lib/tests/test_basic.py index 876e24ce..7afe15de 100644 --- a/codespell_lib/tests/test_basic.py +++ b/codespell_lib/tests/test_basic.py @@ -455,8 +455,8 @@ def test_context(tmpdir, capsys): assert 'ERROR' in lines[0] -def test_ignore_regex_flag(tmpdir, capsys): - """Test ignore regex flag functionality.""" +def test_ignore_regex_option(tmpdir, capsys): + """Test ignore regex option functionality.""" d = str(tmpdir) # Invalid regex. @@ -485,7 +485,116 @@ def test_ignore_regex_flag(tmpdir, capsys): # Ignoring donn breaks them both. assert cs.main(f.name, '--ignore-regex=donn') == 0 # Adding word breaks causes only one to be ignored. 
- assert cs.main(f.name, r'--ignore-regex=\Wdonn\W') == 1 + assert cs.main(f.name, r'--ignore-regex=\bdonn\b') == 1 + + +def test_uri_ignore_words_list_option_uri(tmpdir, capsys): + """Test --uri-ignore-words-list option functionality with URIs.""" + d = str(tmpdir) + + with open(op.join(d, 'flag.txt'), 'w') as f: + f.write('# Please see http://example.com/abandonned for info\n') + # Test file has 1 invalid entry, and it's not ignored by default. + assert cs.main(f.name) == 1 + # An empty list is the default value, and nothing is ignored. + assert cs.main(f.name, '--uri-ignore-words-list=') == 1 + # Listing words other than the misspelling ignores nothing. + assert cs.main(f.name, '--uri-ignore-words-list=foo,example') == 1 + # A word can be ignored. + assert cs.main(f.name, '--uri-ignore-words-list=abandonned') == 0 + assert cs.main(f.name, '--uri-ignore-words-list=foo,abandonned,bar') == 0 + assert cs.main(f.name, '--uri-ignore-words-list=*') == 0 + # The match must be for the complete word. + assert cs.main(f.name, '--uri-ignore-words-list=abandonn') == 1 + + with open(op.join(d, 'flag.txt'), 'w') as f: + f.write('abandonned http://example.com/abandonned\n') + # Test file has 2 invalid entries. + assert cs.main(f.name) == 2 + # Ignoring the value in the URI won't ignore the word completely. + assert cs.main(f.name, '--uri-ignore-words-list=abandonned') == 1 + assert cs.main(f.name, '--uri-ignore-words-list=*') == 1 + # The regular --ignore-words-list will ignore both. + assert cs.main(f.name, '--ignore-words-list=abandonned') == 0 + + variation_option = '--uri-ignore-words-list=abandonned' + + # Variations where an error is ignored. 
+ for variation in ('# Please see http://abandonned for info\n', + '# Please see "http://abandonned" for info\n', + '# Please see https://abandonned for info\n', + '# Please see ftp://abandonned for info\n', + '# Please see http://example/abandonned for info\n', + '# Please see http://example.com/abandonned for info\n', + '# Please see http://example.com/abandonned for info\n', + '# Please see http://exam.com/ple#abandonned for info\n', + '# Please see http://exam.com/ple?abandonned for info\n', + '# Please see http://127.0.0.1/abandonned for info\n', + '# Please see http://[2001:0db8:85a3:0000:0000:8a2e:0370:' + '7334]/abandonned for info\n'): + with open(op.join(d, 'flag.txt'), 'w') as f: + f.write(variation) + assert cs.main(f.name) == 1, variation + assert cs.main(f.name, variation_option) == 0, variation + + # Variations where no error is ignored. + for variation in ('# Please see abandonned/ for info\n', + '# Please see http:abandonned for info\n', + '# Please see foo/abandonned for info\n', + '# Please see http://foo abandonned for info\n', + '# Please see "http://foo"abandonned for info\n'): + with open(op.join(d, 'flag.txt'), 'w') as f: + f.write(variation) + assert cs.main(f.name) == 1, variation + assert cs.main(f.name, variation_option) == 1, variation + + +def test_uri_ignore_words_list_option_email(tmpdir, capsys): + """Test --uri-ignore-words-list option functionality with emails.""" + d = str(tmpdir) + + with open(op.join(d, 'flag.txt'), 'w') as f: + f.write('# Please see example@abandonned.com for info\n') + # Test file has 1 invalid entry, and it's not ignored by default. + assert cs.main(f.name) == 1 + # An empty list is the default value, and nothing is ignored. + assert cs.main(f.name, '--uri-ignore-words-list=') == 1 + # Listing words other than the misspelling ignores nothing. + assert cs.main(f.name, '--uri-ignore-words-list=foo,example') == 1 + # A word can be ignored. 
+ assert cs.main(f.name, '--uri-ignore-words-list=abandonned') == 0 + assert cs.main(f.name, '--uri-ignore-words-list=foo,abandonned,bar') == 0 + assert cs.main(f.name, '--uri-ignore-words-list=*') == 0 + # The match must be for the complete word. + assert cs.main(f.name, '--uri-ignore-words-list=abandonn') == 1 + + with open(op.join(d, 'flag.txt'), 'w') as f: + f.write('abandonned example@abandonned.com\n') + # Test file has 2 invalid entries. + assert cs.main(f.name) == 2 + # Ignoring the value in the email won't ignore the word completely. + assert cs.main(f.name, '--uri-ignore-words-list=abandonned') == 1 + assert cs.main(f.name, '--uri-ignore-words-list=*') == 1 + # The regular --ignore-words-list will ignore both. + assert cs.main(f.name, '--ignore-words-list=abandonned') == 0 + + variation_option = '--uri-ignore-words-list=abandonned' + + # Variations where an error is ignored. + for variation in ('# Please see example@abandonned for info\n', + '# Please see abandonned@example for info\n'): + with open(op.join(d, 'flag.txt'), 'w') as f: + f.write(variation) + assert cs.main(f.name) == 1, variation + assert cs.main(f.name, variation_option) == 0, variation + + # Variations where no error is ignored. + for variation in ('# Please see example @ abandonned for info\n', + '# Please see abandonned@ example for info\n'): + with open(op.join(d, 'flag.txt'), 'w') as f: + f.write(variation) + assert cs.main(f.name) == 1, variation + assert cs.main(f.name, variation_option) == 1, variation @contextlib.contextmanager