Add --ignore-regex for URI/email handling.

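This addresses issue #676, where typos are reported inside URIs and email addresses that are actually correct. Because such strings behave more like names than ordinary words in context, they are now ignored by default. Mechanically, any text matching the ignore regex (URIs and emails by default) is replaced with a space before the word regex is applied.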
2025-08-06 09:40:56 +08:00 · 2020-07-08 09:19:27 -07:00
parent 540c85e8cc
commit 018f0c6465
2 changed files with 90 additions and 5 deletions
--- a/codespell_lib/_codespell.py
+++ b/codespell_lib/_codespell.py
@ -28,6 +28,8 @@ import sys
 import textwrap

 word_regex_def = u"[\\w\\-'’`]+"
+# Matches common URIs and email addresses, in that order.
+ignore_word_regex_def = r"(?:(?:https?|ftp|smtp):\/\/([\w-]+\.)+\w{2,}(?:/(?:[\w:/?#\[\]@!$&'()*+,;=.~-]*/?)*)?|[\w.%+-]+@[\w.-]+\.[a-z]{2,})"  # noqa: E501
 encodings = ('utf-8', 'iso-8859-1')
 USAGE = """
 \t%prog [OPTIONS] [file1 file2 ... fileN]
@ -273,6 +275,11 @@ def parse_options(args):
                        'to include (when "-D -" or no "-D" is passed). '
                        'Current options are:' + builtin_opts + '\n'
                        'The default is %(default)r.')
+    parser.add_argument('--ignore-regex',
+                        action='store', type=str,
+                        help='regular expression which is used to find words '
+                             'to ignore. Matches URIs and emails by default. '
+                             'Can be disabled by setting to "^$".')
    parser.add_argument('-I', '--ignore-words',
                        action='append', metavar='FILE',
                        help='file that contains words which will be ignored '
@ -489,8 +496,13 @@ def print_context(lines, index, context):
            print('%s %s' % ('>' if i == index else ':', lines[i].rstrip()))


+def extract_words(text, word_regex, ignore_word_regex):
+    interesting_text = ignore_word_regex.sub(' ', text)
+    return word_regex.findall(interesting_text)
+
+
 def parse_file(filename, colors, summary, misspellings, exclude_lines,
-               file_opener, word_regex, context, options):
+               file_opener, word_regex, ignore_word_regex, context, options):
    bad_count = 0
    lines = None
    changed = False
@ -501,7 +513,7 @@ def parse_file(filename, colors, summary, misspellings, exclude_lines,
        lines = f.readlines()
    else:
        if options.check_filenames:
-            for word in word_regex.findall(filename):
+            for word in extract_words(filename, word_regex, ignore_word_regex):
                lword = word.lower()
                if lword not in misspellings:
                    continue
@ -555,7 +567,7 @@ def parse_file(filename, colors, summary, misspellings, exclude_lines,
        fixed_words = set()
        asked_for = set()

-        for word in word_regex.findall(line):
+        for word in extract_words(line, word_regex, ignore_word_regex):
            lword = word.lower()
            if lword in misspellings:
                context_shown = False
@ -662,6 +674,14 @@ def main(*args):
              (word_regex, err), file=sys.stderr)
        parser.print_help()
        return EX_USAGE
+    ignore_word_regex = options.ignore_regex or ignore_word_regex_def
+    try:
+        ignore_word_regex = re.compile(ignore_word_regex)
+    except re.error as err:
+        print("ERROR: invalid regular expression \"%s\" (%s)" %
+              (ignore_word_regex, err), file=sys.stderr)
+        parser.print_help()
+        return EX_USAGE

    ignore_words_files = options.ignore_words or []
    ignore_words = set()
@ -770,7 +790,8 @@ def main(*args):
                        continue
                    bad_count += parse_file(
                        fname, colors, summary, misspellings, exclude_lines,
-                        file_opener, word_regex, context, options)
+                        file_opener, word_regex, ignore_word_regex, context,
+                        options)

                # skip (relative) directories
                dirs[:] = [dir_ for dir_ in dirs if not glob_match.match(dir_)]
@ -778,7 +799,7 @@ def main(*args):
        else:
            bad_count += parse_file(
                filename, colors, summary, misspellings, exclude_lines,
-                file_opener, word_regex, context, options)
+                file_opener, word_regex, ignore_word_regex, context, options)

    if summary:
        print("\n-------8<-------\nSUMMARY:")
--- a/codespell_lib/tests/test_basic.py
+++ b/codespell_lib/tests/test_basic.py
@ -455,6 +455,70 @@ def test_context(tmpdir, capsys):
    assert 'ERROR' in lines[0]


+def test_uri(tmpdir, capsys):
+    """Test ignore regex functionality for URIs."""
+    d = str(tmpdir)
+
+    # Ignoring text in path.
+    with open(op.join(d, 'uri.txt'), 'w') as f:
+        f.write('# Please see http://example.com/abandonned for info\n')
+    assert cs.main(f.name) == 0
+    # Same is a typo with ignores disabled.
+    assert cs.main(f.name, '--ignore-regex=^$') == 1
+
+    # Test a different protocol.
+    with open(op.join(d, 'uri.txt'), 'w') as f:
+        f.write('# Please see https://example.com/abandonned for info\n')
+    assert cs.main(f.name) == 0
+
+    # Ignoring text in path ending with /.
+    with open(op.join(d, 'uri.txt'), 'w') as f:
+        f.write('# Please see http://example.com/abandonned/ for info\n')
+    assert cs.main(f.name) == 0
+
+    # Ignoring text in domain.
+    with open(op.join(d, 'uri.txt'), 'w') as f:
+        f.write('# Please see http://abandonned.com/example for info\n')
+    assert cs.main(f.name) == 0
+
+    # Ignoring text in anchor.
+    with open(op.join(d, 'uri.txt'), 'w') as f:
+        f.write('# Please see http://example.com/ex#abandonned for info\n')
+    assert cs.main(f.name) == 0
+
+    # Typo because there's no protocol.
+    with open(op.join(d, 'uri.txt'), 'w') as f:
+        f.write('# Please see example.com/abandonned for info\n')
+    assert cs.main(f.name) == 1
+
+    # Typo because there aren't enough domain parts.
+    with open(op.join(d, 'uri.txt'), 'w') as f:
+        f.write('# Please see http://abandonned for info\n')
+    assert cs.main(f.name) == 1
+
+
+def test_email(tmpdir, capsys):
+    """Test ignore regex functionality for emails."""
+    d = str(tmpdir)
+
+    # Ignoring text in username.
+    with open(op.join(d, 'email.txt'), 'w') as f:
+        f.write('# Please contact abandonned@example.com for info\n')
+    assert cs.main(f.name) == 0
+    # Same is a typo with ignores disabled.
+    assert cs.main(f.name, '--ignore-regex=^$') == 1
+
+    # Ignoring text in domain.
+    with open(op.join(d, 'email.txt'), 'w') as f:
+        f.write('# Please contact example@abandonned.com for info\n')
+    assert cs.main(f.name) == 0
+
+    # Typo because there's no TLD for an email.
+    with open(op.join(d, 'email.txt'), 'w') as f:
+        f.write('# Please contact abandonned@example for info\n')
+    assert cs.main(f.name) == 1
+
+
@contextlib.contextmanager
 def FakeStdin(text):
    if sys.version[0] == '2':