From 1e7f5f7e6f463da5e04447ac8995e611a3bbc770 Mon Sep 17 00:00:00 2001
From: jonmeow <46229924+jonmeow@users.noreply.github.com>
Date: Fri, 28 Aug 2020 14:19:52 -0700
Subject: [PATCH] Add support for ignoring spelling mistakes in URIs
 specifically.

---
 codespell_lib/_codespell.py       |  54 +++++++++++---
 codespell_lib/tests/test_basic.py | 115 +++++++++++++++++++++++++++++-
 2 files changed, 155 insertions(+), 14 deletions(-)

diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py
index 83ebaa4e..2296ea94 100755
--- a/codespell_lib/_codespell.py
+++ b/codespell_lib/_codespell.py
@@ -28,6 +28,7 @@ import sys
 import textwrap
 
 word_regex_def = u"[\\w\\-'’`]+"
+uri_regex_def = u"\\b((?:https?|t?ftp|file|git|smb)://[^\\s'\"]*|[\\w.%+-]+@[\\w.-]+)\\b"
 encodings = ('utf-8', 'iso-8859-1')
 USAGE = """
 \t%prog [OPTIONS] [file1 file2 ... fileN]
@@ -279,7 +280,7 @@ def parse_options(args):
                              'patterns to ignore by treating as whitespace. '
                              'When writing regexes, consider ensuring there '
                              'are boundary non-word chars, e.g., '
-                             '"\\Wmatch\\W". Defaults to empty/disabled.')
+                             '"\\bmatch\\b". Defaults to empty/disabled.')
     parser.add_argument('-I', '--ignore-words',
                         action='append', metavar='FILE',
                         help='file that contains words which will be ignored '
@@ -291,6 +292,13 @@ def parse_options(args):
                         help='comma separated list of words to be ignored '
                              'by codespell. Words are case sensitive based on '
                              'how they are written in the dictionary file')
+    parser.add_argument('--uri-ignore-words-list',
+                        action='append', metavar='WORDS',
+                        help='comma separated list of words to be ignored '
+                             'by codespell in URIs and emails only. Words are '
+                             'case sensitive based on how they are written in '
+                             'the dictionary file. If set to "*", all '
+                             'misspelling in URIs and emails will be ignored.')
     parser.add_argument('-r', '--regex',
                         action='store', type=str,
                         help='regular expression which is used to find words. '
@@ -373,6 +381,15 @@ def parse_options(args):
     return options, parser
 
 
+def parse_ignore_words_option(ignore_words_option):
+    """Flatten a list of comma separated word strings into a set of words."""
+    return {
+        entry.strip()
+        for comma_separated_words in ignore_words_option or ()
+        for entry in comma_separated_words.split(',')
+    }
+
+
 def build_exclude_hashes(filename, exclude_lines):
     with codecs.open(filename, 'r') as f:
         for line in f:
@@ -502,8 +519,20 @@ def extract_words(text, word_regex, ignore_word_regex):
     return word_regex.findall(text)
 
 
+def apply_uri_ignore_words(check_words, line, word_regex, ignore_word_regex,
+                           uri_regex, uri_ignore_words):
+    """Remove ignored words that occur inside URIs/emails from check_words."""
+    for uri in re.findall(uri_regex, line) if uri_ignore_words else ():
+        for uri_word in extract_words(uri, word_regex, ignore_word_regex):
+            # URI tokens can differ from line tokens, e.g. "http://x.com's".
+            if uri_word in check_words and (
+                    "*" in uri_ignore_words or uri_word in uri_ignore_words):
+                check_words.remove(uri_word)
+
+
 def parse_file(filename, colors, summary, misspellings, exclude_lines,
-               file_opener, word_regex, ignore_word_regex, context, options):
+               file_opener, word_regex, ignore_word_regex, uri_regex,
+               uri_ignore_words, context, options):
     bad_count = 0
     lines = None
     changed = False
@@ -568,7 +597,11 @@ def parse_file(filename, colors, summary, misspellings, exclude_lines,
         fixed_words = set()
         asked_for = set()
 
-        for word in extract_words(line, word_regex, ignore_word_regex):
+        check_words = extract_words(line, word_regex, ignore_word_regex)
+        apply_uri_ignore_words(check_words, line, word_regex, ignore_word_regex,
+                               uri_regex, uri_ignore_words)
+
+        for word in check_words:
             lword = word.lower()
             if lword in misspellings:
                 context_shown = False
@@ -688,7 +721,7 @@ def main(*args):
         ignore_word_regex = None
 
     ignore_words_files = options.ignore_words or []
-    ignore_words = set()
+    ignore_words = parse_ignore_words_option(options.ignore_words_list)
     for ignore_words_file in ignore_words_files:
         if not os.path.isfile(ignore_words_file):
             print("ERROR: cannot find ignore-words file: %s" %
@@ -697,10 +730,8 @@ def main(*args):
             return EX_USAGE
         build_ignore_words(ignore_words_file, ignore_words)
 
-    ignore_words_list = options.ignore_words_list or []
-    for comma_separated_words in ignore_words_list:
-        for word in comma_separated_words.split(','):
-            ignore_words.add(word.strip())
+    uri_regex = re.compile(uri_regex_def)
+    uri_ignore_words = parse_ignore_words_option(options.uri_ignore_words_list)
 
     if options.dictionary:
         dictionaries = options.dictionary
@@ -794,8 +825,8 @@ def main(*args):
                         continue
                     bad_count += parse_file(
                         fname, colors, summary, misspellings, exclude_lines,
-                        file_opener, word_regex, ignore_word_regex, context,
-                        options)
+                        file_opener, word_regex, ignore_word_regex, uri_regex,
+                        uri_ignore_words, context, options)
 
                 # skip (relative) directories
                 dirs[:] = [dir_ for dir_ in dirs if not glob_match.match(dir_)]
@@ -803,7 +834,8 @@ def main(*args):
         else:
             bad_count += parse_file(
                 filename, colors, summary, misspellings, exclude_lines,
-                file_opener, word_regex, ignore_word_regex, context, options)
+                file_opener, word_regex, ignore_word_regex, uri_regex,
+                uri_ignore_words, context, options)
 
     if summary:
         print("\n-------8<-------\nSUMMARY:")
diff --git a/codespell_lib/tests/test_basic.py b/codespell_lib/tests/test_basic.py
index 876e24ce..7afe15de 100644
--- a/codespell_lib/tests/test_basic.py
+++ b/codespell_lib/tests/test_basic.py
@@ -455,8 +455,8 @@ def test_context(tmpdir, capsys):
     assert 'ERROR' in lines[0]
 
 
-def test_ignore_regex_flag(tmpdir, capsys):
-    """Test ignore regex flag functionality."""
+def test_ignore_regex_option(tmpdir, capsys):
+    """Test ignore regex option functionality."""
     d = str(tmpdir)
 
     # Invalid regex.
@@ -485,7 +485,116 @@ def test_ignore_regex_flag(tmpdir, capsys):
     # Ignoring donn breaks them both.
     assert cs.main(f.name, '--ignore-regex=donn') == 0
     # Adding word breaks causes only one to be ignored.
-    assert cs.main(f.name, r'--ignore-regex=\Wdonn\W') == 1
+    assert cs.main(f.name, r'--ignore-regex=\bdonn\b') == 1
+
+
+def test_uri_ignore_words_list_option_uri(tmpdir, capsys):
+    """Test --uri-ignore-words-list option functionality with URIs."""
+    d = str(tmpdir)
+
+    with open(op.join(d, 'flag.txt'), 'w') as f:
+        f.write('# Please see http://example.com/abandonned for info\n')
+    # Test file has 1 invalid entry, and it's not ignored by default.
+    assert cs.main(f.name) == 1
+    # An empty list is the default value, and nothing is ignored.
+    assert cs.main(f.name, '--uri-ignore-words-list=') == 1
+    # Non-matching words result in nothing being ignored.
+    assert cs.main(f.name, '--uri-ignore-words-list=foo,example') == 1
+    # A word can be ignored.
+    assert cs.main(f.name, '--uri-ignore-words-list=abandonned') == 0
+    assert cs.main(f.name, '--uri-ignore-words-list=foo,abandonned,bar') == 0
+    assert cs.main(f.name, '--uri-ignore-words-list=*') == 0
+    # The match must be for the complete word.
+    assert cs.main(f.name, '--uri-ignore-words-list=abandonn') == 1
+
+    with open(op.join(d, 'flag.txt'), 'w') as f:
+        f.write('abandonned http://example.com/abandonned\n')
+    # Test file has 2 invalid entries.
+    assert cs.main(f.name) == 2
+    # Ignoring the value in the URI won't ignore the word completely.
+    assert cs.main(f.name, '--uri-ignore-words-list=abandonned') == 1
+    assert cs.main(f.name, '--uri-ignore-words-list=*') == 1
+    # The regular --ignore-words-list will ignore both.
+    assert cs.main(f.name, '--ignore-words-list=abandonned') == 0
+
+    variation_option = '--uri-ignore-words-list=abandonned'
+
+    # Variations where an error is ignored.
+    for variation in ('# Please see http://abandonned for info\n',
+                      '# Please see "http://abandonned" for info\n',
+                      '# Please see https://abandonned for info\n',
+                      '# Please see ftp://abandonned for info\n',
+                      '# Please see http://example/abandonned for info\n',
+                      '# Please see http://example.com/abandonned for info\n',
+                      '# Please see http://example.com/abandonned for info\n',
+                      '# Please see http://exam.com/ple#abandonned for info\n',
+                      '# Please see http://exam.com/ple?abandonned for info\n',
+                      '# Please see http://127.0.0.1/abandonned for info\n',
+                      '# Please see http://[2001:0db8:85a3:0000:0000:8a2e:0370:'
+                      '7334]/abandonned for info\n'):
+        with open(op.join(d, 'flag.txt'), 'w') as f:
+            f.write(variation)
+        assert cs.main(f.name) == 1, variation
+        assert cs.main(f.name, variation_option) == 0, variation
+
+    # Variations where no error is ignored.
+    for variation in ('# Please see abandonned/ for info\n',
+                      '# Please see http:abandonned for info\n',
+                      '# Please see foo/abandonned for info\n',
+                      '# Please see http://foo abandonned for info\n',
+                      '# Please see "http://foo"abandonned for info\n'):
+        with open(op.join(d, 'flag.txt'), 'w') as f:
+            f.write(variation)
+        assert cs.main(f.name) == 1, variation
+        assert cs.main(f.name, variation_option) == 1, variation
+
+
+def test_uri_ignore_words_list_option_email(tmpdir, capsys):
+    """Test --uri-ignore-words-list option functionality with emails."""
+    d = str(tmpdir)
+
+    with open(op.join(d, 'flag.txt'), 'w') as f:
+        f.write('# Please see example@abandonned.com for info\n')
+    # Test file has 1 invalid entry, and it's not ignored by default.
+    assert cs.main(f.name) == 1
+    # An empty list is the default value, and nothing is ignored.
+    assert cs.main(f.name, '--uri-ignore-words-list=') == 1
+    # Non-matching words result in nothing being ignored.
+    assert cs.main(f.name, '--uri-ignore-words-list=foo,example') == 1
+    # A word can be ignored.
+    assert cs.main(f.name, '--uri-ignore-words-list=abandonned') == 0
+    assert cs.main(f.name, '--uri-ignore-words-list=foo,abandonned,bar') == 0
+    assert cs.main(f.name, '--uri-ignore-words-list=*') == 0
+    # The match must be for the complete word.
+    assert cs.main(f.name, '--uri-ignore-words-list=abandonn') == 1
+
+    with open(op.join(d, 'flag.txt'), 'w') as f:
+        f.write('abandonned example@abandonned.com\n')
+    # Test file has 2 invalid entries.
+    assert cs.main(f.name) == 2
+    # Ignoring the value in the email won't ignore the word completely.
+    assert cs.main(f.name, '--uri-ignore-words-list=abandonned') == 1
+    assert cs.main(f.name, '--uri-ignore-words-list=*') == 1
+    # The regular --ignore-words-list will ignore both.
+    assert cs.main(f.name, '--ignore-words-list=abandonned') == 0
+
+    variation_option = '--uri-ignore-words-list=abandonned'
+
+    # Variations where an error is ignored.
+    for variation in ('# Please see example@abandonned for info\n',
+                      '# Please see abandonned@example for info\n'):
+        with open(op.join(d, 'flag.txt'), 'w') as f:
+            f.write(variation)
+        assert cs.main(f.name) == 1, variation
+        assert cs.main(f.name, variation_option) == 0, variation
+
+    # Variations where no error is ignored.
+    for variation in ('# Please see example @ abandonned for info\n',
+                      '# Please see abandonned@ example for info\n'):
+        with open(op.join(d, 'flag.txt'), 'w') as f:
+            f.write(variation)
+        assert cs.main(f.name) == 1, variation
+        assert cs.main(f.name, variation_option) == 1, variation
 
 
 @contextlib.contextmanager