mirror of
https://github.com/codespell-project/codespell.git
synced 2025-08-06 01:36:26 +08:00
Add --ignore-regex for URI/email handling.
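This addresses issue #676, where typos are reported inside otherwise valid URIs and email addresses. Because URIs and email addresses behave more like proper names than ordinary words in context, they are now ignored by default. Mechanically, any URI/email text is blanked out (replaced with a space) before the word regex is applied.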
This commit is contained in:
@ -28,6 +28,8 @@ import sys
|
|||||||
import textwrap
|
import textwrap
|
||||||
|
|
||||||
word_regex_def = u"[\\w\\-'’`]+"
|
word_regex_def = u"[\\w\\-'’`]+"
|
||||||
|
# Matches common URIs and email addresses, in that order.
|
||||||
|
ignore_word_regex_def = r"(?:(?:https?|ftp|smtp):\/\/([\w-]+\.)+\w{2,}(?:/(?:[\w:/?#\[\]@!$&'()*+,;=.~-]*/?)*)?|[\w.%+-]+@[\w.-]+\.[a-z]{2,})" # noqa: E501
|
||||||
encodings = ('utf-8', 'iso-8859-1')
|
encodings = ('utf-8', 'iso-8859-1')
|
||||||
USAGE = """
|
USAGE = """
|
||||||
\t%prog [OPTIONS] [file1 file2 ... fileN]
|
\t%prog [OPTIONS] [file1 file2 ... fileN]
|
||||||
@ -273,6 +275,11 @@ def parse_options(args):
|
|||||||
'to include (when "-D -" or no "-D" is passed). '
|
'to include (when "-D -" or no "-D" is passed). '
|
||||||
'Current options are:' + builtin_opts + '\n'
|
'Current options are:' + builtin_opts + '\n'
|
||||||
'The default is %(default)r.')
|
'The default is %(default)r.')
|
||||||
|
parser.add_argument('--ignore-regex',
|
||||||
|
action='store', type=str,
|
||||||
|
help='regular expression which is used to find words '
|
||||||
|
'to ignore. Matches URIs and emails by default. '
|
||||||
|
'Can be disabled by setting to "^$".')
|
||||||
parser.add_argument('-I', '--ignore-words',
|
parser.add_argument('-I', '--ignore-words',
|
||||||
action='append', metavar='FILE',
|
action='append', metavar='FILE',
|
||||||
help='file that contains words which will be ignored '
|
help='file that contains words which will be ignored '
|
||||||
@ -489,8 +496,13 @@ def print_context(lines, index, context):
|
|||||||
print('%s %s' % ('>' if i == index else ':', lines[i].rstrip()))
|
print('%s %s' % ('>' if i == index else ':', lines[i].rstrip()))
|
||||||
|
|
||||||
|
|
||||||
|
def extract_words(text, word_regex, ignore_word_regex):
    """Return the words found in *text*, skipping ignored spans.

    Every match of ``ignore_word_regex`` is first replaced by a single
    space; ``word_regex.findall`` is then run on what remains.

    Args:
        text: The string to scan.
        word_regex: Compiled pattern whose matches are the words returned.
        ignore_word_regex: Compiled pattern whose matches are blanked out
            before the words are extracted.

    Returns:
        The list produced by ``word_regex.findall`` on the filtered text.
    """
    # Substitute a space rather than an empty string so the text on either
    # side of an ignored span is not fused into a single word.
    interesting_text = ignore_word_regex.sub(' ', text)
    return word_regex.findall(interesting_text)
|
||||||
|
|
||||||
|
|
||||||
def parse_file(filename, colors, summary, misspellings, exclude_lines,
|
def parse_file(filename, colors, summary, misspellings, exclude_lines,
|
||||||
file_opener, word_regex, context, options):
|
file_opener, word_regex, ignore_word_regex, context, options):
|
||||||
bad_count = 0
|
bad_count = 0
|
||||||
lines = None
|
lines = None
|
||||||
changed = False
|
changed = False
|
||||||
@ -501,7 +513,7 @@ def parse_file(filename, colors, summary, misspellings, exclude_lines,
|
|||||||
lines = f.readlines()
|
lines = f.readlines()
|
||||||
else:
|
else:
|
||||||
if options.check_filenames:
|
if options.check_filenames:
|
||||||
for word in word_regex.findall(filename):
|
for word in extract_words(filename, word_regex, ignore_word_regex):
|
||||||
lword = word.lower()
|
lword = word.lower()
|
||||||
if lword not in misspellings:
|
if lword not in misspellings:
|
||||||
continue
|
continue
|
||||||
@ -555,7 +567,7 @@ def parse_file(filename, colors, summary, misspellings, exclude_lines,
|
|||||||
fixed_words = set()
|
fixed_words = set()
|
||||||
asked_for = set()
|
asked_for = set()
|
||||||
|
|
||||||
for word in word_regex.findall(line):
|
for word in extract_words(line, word_regex, ignore_word_regex):
|
||||||
lword = word.lower()
|
lword = word.lower()
|
||||||
if lword in misspellings:
|
if lword in misspellings:
|
||||||
context_shown = False
|
context_shown = False
|
||||||
@ -662,6 +674,14 @@ def main(*args):
|
|||||||
(word_regex, err), file=sys.stderr)
|
(word_regex, err), file=sys.stderr)
|
||||||
parser.print_help()
|
parser.print_help()
|
||||||
return EX_USAGE
|
return EX_USAGE
|
||||||
|
ignore_word_regex = options.ignore_regex or ignore_word_regex_def
|
||||||
|
try:
|
||||||
|
ignore_word_regex = re.compile(ignore_word_regex)
|
||||||
|
except re.error as err:
|
||||||
|
print("ERROR: invalid regular expression \"%s\" (%s)" %
|
||||||
|
(ignore_word_regex, err), file=sys.stderr)
|
||||||
|
parser.print_help()
|
||||||
|
return EX_USAGE
|
||||||
|
|
||||||
ignore_words_files = options.ignore_words or []
|
ignore_words_files = options.ignore_words or []
|
||||||
ignore_words = set()
|
ignore_words = set()
|
||||||
@ -770,7 +790,8 @@ def main(*args):
|
|||||||
continue
|
continue
|
||||||
bad_count += parse_file(
|
bad_count += parse_file(
|
||||||
fname, colors, summary, misspellings, exclude_lines,
|
fname, colors, summary, misspellings, exclude_lines,
|
||||||
file_opener, word_regex, context, options)
|
file_opener, word_regex, ignore_word_regex, context,
|
||||||
|
options)
|
||||||
|
|
||||||
# skip (relative) directories
|
# skip (relative) directories
|
||||||
dirs[:] = [dir_ for dir_ in dirs if not glob_match.match(dir_)]
|
dirs[:] = [dir_ for dir_ in dirs if not glob_match.match(dir_)]
|
||||||
@ -778,7 +799,7 @@ def main(*args):
|
|||||||
else:
|
else:
|
||||||
bad_count += parse_file(
|
bad_count += parse_file(
|
||||||
filename, colors, summary, misspellings, exclude_lines,
|
filename, colors, summary, misspellings, exclude_lines,
|
||||||
file_opener, word_regex, context, options)
|
file_opener, word_regex, ignore_word_regex, context, options)
|
||||||
|
|
||||||
if summary:
|
if summary:
|
||||||
print("\n-------8<-------\nSUMMARY:")
|
print("\n-------8<-------\nSUMMARY:")
|
||||||
|
@ -455,6 +455,70 @@ def test_context(tmpdir, capsys):
|
|||||||
assert 'ERROR' in lines[0]
|
assert 'ERROR' in lines[0]
|
||||||
|
|
||||||
|
|
||||||
|
def test_uri(tmpdir, capsys):
    """Test ignore regex functionality for URIs.

    Each case rewrites ``uri.txt`` in the temporary directory with a line
    containing the misspelling "abandonned" and checks the value returned
    by ``cs.main``: 0 when the misspelling sits inside an ignored URI, 1
    when it is reported as a typo.
    """
    d = str(tmpdir)

    # Ignoring text in path.
    # The assertions run after the ``with`` block so the file is closed
    # (and its content written out) before it is read back.
    with open(op.join(d, 'uri.txt'), 'w') as f:
        f.write('# Please see http://example.com/abandonned for info\n')
    assert cs.main(f.name) == 0
    # Same is a typo with ignores disabled.
    assert cs.main(f.name, '--ignore-regex=^$') == 1

    # Test a different protocol.
    with open(op.join(d, 'uri.txt'), 'w') as f:
        f.write('# Please see https://example.com/abandonned for info\n')
    assert cs.main(f.name) == 0

    # Ignoring text in path ending with /.
    with open(op.join(d, 'uri.txt'), 'w') as f:
        f.write('# Please see http://example.com/abandonned/ for info\n')
    assert cs.main(f.name) == 0

    # Ignoring text in domain.
    with open(op.join(d, 'uri.txt'), 'w') as f:
        f.write('# Please see http://abandonned.com/example for info\n')
    assert cs.main(f.name) == 0

    # Ignoring text in anchor.
    with open(op.join(d, 'uri.txt'), 'w') as f:
        f.write('# Please see http://example.com/ex#abandonned for info\n')
    assert cs.main(f.name) == 0

    # Typo because there's no protocol.
    with open(op.join(d, 'uri.txt'), 'w') as f:
        f.write('# Please see example.com/abandonned for info\n')
    assert cs.main(f.name) == 1

    # Typo because there aren't enough domain parts.
    with open(op.join(d, 'uri.txt'), 'w') as f:
        f.write('# Please see http://abandonned for info\n')
    assert cs.main(f.name) == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_email(tmpdir, capsys):
    """Test ignore regex functionality for emails.

    Each case rewrites ``email.txt`` in the temporary directory with a line
    containing the misspelling "abandonned" and checks the value returned
    by ``cs.main``: 0 when the misspelling sits inside an ignored email
    address, 1 when it is reported as a typo.
    """
    d = str(tmpdir)

    # Ignoring text in username.
    # The assertions run after the ``with`` block so the file is closed
    # (and its content written out) before it is read back.
    with open(op.join(d, 'email.txt'), 'w') as f:
        f.write('# Please contact abandonned@example.com for info\n')
    assert cs.main(f.name) == 0
    # Same is a typo with ignores disabled.
    assert cs.main(f.name, '--ignore-regex=^$') == 1

    # Ignoring text in domain.
    with open(op.join(d, 'email.txt'), 'w') as f:
        f.write('# Please contact example@abandonned.com for info\n')
    assert cs.main(f.name) == 0

    # Typo because there's no TLD for an email.
    with open(op.join(d, 'email.txt'), 'w') as f:
        f.write('# Please contact abandonned@example for info\n')
    assert cs.main(f.name) == 1
|
||||||
|
|
||||||
|
|
||||||
@contextlib.contextmanager
|
@contextlib.contextmanager
|
||||||
def FakeStdin(text):
|
def FakeStdin(text):
|
||||||
if sys.version[0] == '2':
|
if sys.version[0] == '2':
|
||||||
|
Reference in New Issue
Block a user