mirror of
https://github.com/codespell-project/codespell.git
synced 2025-08-06 09:40:56 +08:00
Add --ignore-regex for URI/email handling.
This addresses issue #676, where typos are reported inside otherwise valid URIs and email addresses. Because such strings behave more like names than like prose in context, they are now ignored. Mechanically, the URI/email text is erased (replaced with a space) before the word regex is applied.
This commit is contained in:
@ -28,6 +28,8 @@ import sys
|
||||
import textwrap
|
||||
|
||||
# Default pattern for a "word": word characters, hyphens, and the
# apostrophe-like marks (straight quote, right single quote, backtick).
word_regex_def = u"[\\w\\-'’`]+"
# Matches common URIs and email addresses, in that order.
ignore_word_regex_def = (
    # URI: scheme, dotted host ending in a 2+ character label, then an
    # optional path/query/fragment.
    r"(?:(?:https?|ftp|smtp):\/\/([\w-]+\.)+\w{2,}"
    r"(?:/(?:[\w:/?#\[\]@!$&'()*+,;=.~-]*/?)*)?"
    # Email: local part, '@', domain, and a lowercase TLD of 2+ letters.
    r"|[\w.%+-]+@[\w.-]+\.[a-z]{2,})"
)
# Encodings listed in this order: UTF-8 first, then ISO-8859-1.
encodings = ('utf-8', 'iso-8859-1')
|
||||
USAGE = """
|
||||
\t%prog [OPTIONS] [file1 file2 ... fileN]
|
||||
@ -273,6 +275,11 @@ def parse_options(args):
|
||||
'to include (when "-D -" or no "-D" is passed). '
|
||||
'Current options are:' + builtin_opts + '\n'
|
||||
'The default is %(default)r.')
|
||||
parser.add_argument('--ignore-regex',
|
||||
action='store', type=str,
|
||||
help='regular expression which is used to find words '
|
||||
'to ignore. Matches URIs and emails by default. '
|
||||
'Can be disabled by setting to "^$".')
|
||||
parser.add_argument('-I', '--ignore-words',
|
||||
action='append', metavar='FILE',
|
||||
help='file that contains words which will be ignored '
|
||||
@ -489,8 +496,13 @@ def print_context(lines, index, context):
|
||||
print('%s %s' % ('>' if i == index else ':', lines[i].rstrip()))
|
||||
|
||||
|
||||
def extract_words(text, word_regex, ignore_word_regex):
    """Return the words found in ``text`` outside of ignored spans.

    Every match of ``ignore_word_regex`` is first replaced by a single
    space, so the ignored text cannot contribute to (or join) any word;
    ``word_regex.findall`` is then applied to what remains.

    Both regex arguments are compiled pattern objects.
    """
    return word_regex.findall(ignore_word_regex.sub(' ', text))
|
||||
|
||||
|
||||
def parse_file(filename, colors, summary, misspellings, exclude_lines,
|
||||
file_opener, word_regex, context, options):
|
||||
file_opener, word_regex, ignore_word_regex, context, options):
|
||||
bad_count = 0
|
||||
lines = None
|
||||
changed = False
|
||||
@ -501,7 +513,7 @@ def parse_file(filename, colors, summary, misspellings, exclude_lines,
|
||||
lines = f.readlines()
|
||||
else:
|
||||
if options.check_filenames:
|
||||
for word in word_regex.findall(filename):
|
||||
for word in extract_words(filename, word_regex, ignore_word_regex):
|
||||
lword = word.lower()
|
||||
if lword not in misspellings:
|
||||
continue
|
||||
@ -555,7 +567,7 @@ def parse_file(filename, colors, summary, misspellings, exclude_lines,
|
||||
fixed_words = set()
|
||||
asked_for = set()
|
||||
|
||||
for word in word_regex.findall(line):
|
||||
for word in extract_words(line, word_regex, ignore_word_regex):
|
||||
lword = word.lower()
|
||||
if lword in misspellings:
|
||||
context_shown = False
|
||||
@ -662,6 +674,14 @@ def main(*args):
|
||||
(word_regex, err), file=sys.stderr)
|
||||
parser.print_help()
|
||||
return EX_USAGE
|
||||
ignore_word_regex = options.ignore_regex or ignore_word_regex_def
|
||||
try:
|
||||
ignore_word_regex = re.compile(ignore_word_regex)
|
||||
except re.error as err:
|
||||
print("ERROR: invalid regular expression \"%s\" (%s)" %
|
||||
(ignore_word_regex, err), file=sys.stderr)
|
||||
parser.print_help()
|
||||
return EX_USAGE
|
||||
|
||||
ignore_words_files = options.ignore_words or []
|
||||
ignore_words = set()
|
||||
@ -770,7 +790,8 @@ def main(*args):
|
||||
continue
|
||||
bad_count += parse_file(
|
||||
fname, colors, summary, misspellings, exclude_lines,
|
||||
file_opener, word_regex, context, options)
|
||||
file_opener, word_regex, ignore_word_regex, context,
|
||||
options)
|
||||
|
||||
# skip (relative) directories
|
||||
dirs[:] = [dir_ for dir_ in dirs if not glob_match.match(dir_)]
|
||||
@ -778,7 +799,7 @@ def main(*args):
|
||||
else:
|
||||
bad_count += parse_file(
|
||||
filename, colors, summary, misspellings, exclude_lines,
|
||||
file_opener, word_regex, context, options)
|
||||
file_opener, word_regex, ignore_word_regex, context, options)
|
||||
|
||||
if summary:
|
||||
print("\n-------8<-------\nSUMMARY:")
|
||||
|
@ -455,6 +455,70 @@ def test_context(tmpdir, capsys):
|
||||
assert 'ERROR' in lines[0]
|
||||
|
||||
|
||||
def test_uri(tmpdir, capsys):
    """Check that the default ignore regex skips typos inside URIs."""
    target = op.join(str(tmpdir), 'uri.txt')

    def rewrite(content):
        # Truncate and refill the single scratch file used by every case.
        with open(target, 'w') as handle:
            handle.write(content)

    # A typo in the path is ignored by default ...
    rewrite('# Please see http://example.com/abandonned for info\n')
    assert cs.main(target) == 0
    # ... but is reported once the ignore regex is disabled.
    assert cs.main(target, '--ignore-regex=^$') == 1

    ignored_cases = (
        # A different protocol.
        '# Please see https://example.com/abandonned for info\n',
        # Path ending with a slash.
        '# Please see http://example.com/abandonned/ for info\n',
        # Typo located in the domain.
        '# Please see http://abandonned.com/example for info\n',
        # Typo located in the anchor.
        '# Please see http://example.com/ex#abandonned for info\n',
    )
    for content in ignored_cases:
        rewrite(content)
        assert cs.main(target) == 0

    reported_cases = (
        # No protocol, so this is not treated as a URI.
        '# Please see example.com/abandonned for info\n',
        # Not enough domain parts to count as a URI.
        '# Please see http://abandonned for info\n',
    )
    for content in reported_cases:
        rewrite(content)
        assert cs.main(target) == 1
|
||||
|
||||
|
||||
def test_email(tmpdir, capsys):
    """Check that the default ignore regex skips typos inside emails."""
    target = op.join(str(tmpdir), 'email.txt')

    def rewrite(content):
        # Truncate and refill the single scratch file used by every case.
        with open(target, 'w') as handle:
            handle.write(content)

    # A typo in the username is ignored by default ...
    rewrite('# Please contact abandonned@example.com for info\n')
    assert cs.main(target) == 0
    # ... but is reported once the ignore regex is disabled.
    assert cs.main(target, '--ignore-regex=^$') == 1

    # A typo in the domain is ignored as well.
    rewrite('# Please contact example@abandonned.com for info\n')
    assert cs.main(target) == 0

    # Without a TLD the text is not an email, so the typo is reported.
    rewrite('# Please contact abandonned@example for info\n')
    assert cs.main(target) == 1
|
||||
|
||||
|
||||
@contextlib.contextmanager
|
||||
def FakeStdin(text):
|
||||
if sys.version[0] == '2':
|
||||
|
Reference in New Issue
Block a user