Add --ignore-regex for URI/email handling.

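This addresses issue #676, where typos are reported inside URIs and email addresses that are actually correct. In context these behave more like proper names than prose, so they are now ignored by default. Mechanically, any text matching the ignore regex (URIs and emails by default) is replaced with a space before the word regex is applied.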
2025-08-06 01:36:26 +08:00 · 2020-07-08 09:19:27 -07:00
parent 540c85e8cc
commit 018f0c6465
2 changed files with 90 additions and 5 deletions
--- a/codespell_lib/_codespell.py
+++ b/codespell_lib/_codespell.py
@ -28,6 +28,8 @@ import sys
 import textwrap
 word_regex_def = u"[\\w\\-'’`]+"
 # Matches common URIs and email addresses, in that order.
 # The pattern is used with ``sub`` to blank out matching text, so no
 # capturing groups are needed.
 ignore_word_regex_def = (
    # URI: scheme, dotted host whose last label has 2+ characters, and an
    # optional path/query/fragment.
    r"(?:(?:https?|ftp|smtp)://(?:[\w-]+\.)+\w{2,}"
    r"(?:/(?:[\w:/?#\[\]@!$&'()*+,;=.~-]*/?)*)?"
    # Email: local part, '@', domain, and an alphabetic TLD.  The TLD class
    # accepts both cases so that e.g. user@EXAMPLE.COM is ignored too.
    r"|[\w.%+-]+@[\w.-]+\.[a-zA-Z]{2,})"
 )
 encodings = ('utf-8', 'iso-8859-1')
 USAGE = """
 \t%prog [OPTIONS] [file1 file2 ... fileN]
@ -273,6 +275,11 @@ def parse_options(args):
                        'to include (when "-D -" or no "-D" is passed). '
                        'Current options are:' + builtin_opts + '\n'
                        'The default is %(default)r.')
    parser.add_argument('--ignore-regex',
                        action='store', type=str,
                        help='regular expression which is used to find words '
                             'to ignore. Matches URIs and emails by default. '
                             'Can be disabled by setting to "^$".')
    parser.add_argument('-I', '--ignore-words',
                        action='append', metavar='FILE',
                        help='file that contains words which will be ignored '
@ -489,8 +496,13 @@ def print_context(lines, index, context):
            print('%s %s' % ('>' if i == index else ':', lines[i].rstrip()))
 def extract_words(text, word_regex, ignore_word_regex):
    """Return the words of ``text`` that lie outside ignored spans.

    Each match of ``ignore_word_regex`` in ``text`` is replaced by a single
    space, and ``word_regex.findall`` is then applied to the result.
    """
    return word_regex.findall(ignore_word_regex.sub(' ', text))
 def parse_file(filename, colors, summary, misspellings, exclude_lines,
-               file_opener, word_regex, context, options):
+               file_opener, word_regex, ignore_word_regex, context, options):
    bad_count = 0
    lines = None
    changed = False
@ -501,7 +513,7 @@ def parse_file(filename, colors, summary, misspellings, exclude_lines,
        lines = f.readlines()
    else:
        if options.check_filenames:
-            for word in word_regex.findall(filename):
+            for word in extract_words(filename, word_regex, ignore_word_regex):
                lword = word.lower()
                if lword not in misspellings:
                    continue
@ -555,7 +567,7 @@ def parse_file(filename, colors, summary, misspellings, exclude_lines,
        fixed_words = set()
        asked_for = set()
-        for word in word_regex.findall(line):
+        for word in extract_words(line, word_regex, ignore_word_regex):
            lword = word.lower()
            if lword in misspellings:
                context_shown = False
@ -662,6 +674,14 @@ def main(*args):
              (word_regex, err), file=sys.stderr)
        parser.print_help()
        return EX_USAGE
    ignore_word_regex = options.ignore_regex or ignore_word_regex_def
    try:
        ignore_word_regex = re.compile(ignore_word_regex)
    except re.error as err:
        print("ERROR: invalid regular expression \"%s\" (%s)" %
              (ignore_word_regex, err), file=sys.stderr)
        parser.print_help()
        return EX_USAGE
    ignore_words_files = options.ignore_words or []
    ignore_words = set()
@ -770,7 +790,8 @@ def main(*args):
                        continue
                    bad_count += parse_file(
                        fname, colors, summary, misspellings, exclude_lines,
-                        file_opener, word_regex, context, options)
+                        file_opener, word_regex, ignore_word_regex, context,
                        options)
                # skip (relative) directories
                dirs[:] = [dir_ for dir_ in dirs if not glob_match.match(dir_)]
@ -778,7 +799,7 @@ def main(*args):
        else:
            bad_count += parse_file(
                filename, colors, summary, misspellings, exclude_lines,
-                file_opener, word_regex, context, options)
+                file_opener, word_regex, ignore_word_regex, context, options)
    if summary:
        print("\n-------8<-------\nSUMMARY:")
--- a/codespell_lib/tests/test_basic.py
+++ b/codespell_lib/tests/test_basic.py
@ -455,6 +455,70 @@ def test_context(tmpdir, capsys):
    assert 'ERROR' in lines[0]
 def test_uri(tmpdir, capsys):
    """Check how the default ignore regex treats typos inside URIs."""
    uri_file = op.join(str(tmpdir), 'uri.txt')

    def rewrite(content):
        # Replace the file's content with a single line of text.
        with open(uri_file, 'w') as handle:
            handle.write(content)

    # Typo in the path: ignored by default, but reported as a typo when
    # the ignore regex is disabled.
    rewrite('# Please see http://example.com/abandonned for info\n')
    assert cs.main(uri_file) == 0
    assert cs.main(uri_file, '--ignore-regex=^$') == 1

    ignored = (
        # A different protocol.
        '# Please see https://example.com/abandonned for info\n',
        # Text in a path ending with /.
        '# Please see http://example.com/abandonned/ for info\n',
        # Text in the domain.
        '# Please see http://abandonned.com/example for info\n',
        # Text in the anchor.
        '# Please see http://example.com/ex#abandonned for info\n',
    )
    for content in ignored:
        rewrite(content)
        assert cs.main(uri_file) == 0

    reported = (
        # Typo because there's no protocol.
        '# Please see example.com/abandonned for info\n',
        # Typo because there aren't enough domain parts.
        '# Please see http://abandonned for info\n',
    )
    for content in reported:
        rewrite(content)
        assert cs.main(uri_file) == 1
 def test_email(tmpdir, capsys):
    """Check how the default ignore regex treats typos inside emails."""
    email_file = op.join(str(tmpdir), 'email.txt')
    # Each scenario is the file content plus the runs to perform on it,
    # where a run is (extra command-line arguments, expected result).
    scenarios = (
        # Text in the username: ignored by default, but reported as a typo
        # when the ignore regex is disabled.
        ('# Please contact abandonned@example.com for info\n',
         (((), 0), (('--ignore-regex=^$',), 1))),
        # Text in the domain is ignored.
        ('# Please contact example@abandonned.com for info\n',
         (((), 0),)),
        # Typo because there's no TLD for an email.
        ('# Please contact abandonned@example for info\n',
         (((), 1),)),
    )
    for content, runs in scenarios:
        with open(email_file, 'w') as handle:
            handle.write(content)
        for extra_args, expected in runs:
            assert cs.main(email_file, *extra_args) == expected
@contextlib.contextmanager
 def FakeStdin(text):
    if sys.version[0] == '2':