Add --ignore-regex for URI/email handling.

This addresses issue #676, where typos are reported inside otherwise valid URIs and email addresses. Because such text behaves more like a proper name than ordinary prose in context, it is now ignored.

Mechanically, this erases the URI/email text before the word regex is applied.
This commit is contained in:
jonmeow
2020-07-08 09:19:27 -07:00
parent 540c85e8cc
commit 018f0c6465
2 changed files with 90 additions and 5 deletions

View File

@ -28,6 +28,8 @@ import sys
import textwrap import textwrap
word_regex_def = u"[\\w\\-'`]+" word_regex_def = u"[\\w\\-'`]+"
# Default pattern for text that should not be spell-checked: common URIs
# first, then email addresses.  Everything it matches is replaced by a space
# before the word regex is applied (see extract_words).
ignore_word_regex_def = (
    r"(?:"
    # URI: scheme, a dotted host with at least two labels, optional path.
    r"(?:https?|ftp|smtp):\/\/([\w-]+\.)+\w{2,}"
    r"(?:/(?:[\w:/?#\[\]@!$&'()*+,;=.~-]*/?)*)?"
    r"|"
    # Email: local part, "@", and a domain ending in an alphabetic TLD.
    # The TLD accepts either case so that addresses such as
    # user@EXAMPLE.COM are ignored just like their lower-case form.
    r"[\w.%+-]+@[\w.-]+\.[a-zA-Z]{2,}"
    r")"
)
encodings = ('utf-8', 'iso-8859-1') encodings = ('utf-8', 'iso-8859-1')
USAGE = """ USAGE = """
\t%prog [OPTIONS] [file1 file2 ... fileN] \t%prog [OPTIONS] [file1 file2 ... fileN]
@ -273,6 +275,11 @@ def parse_options(args):
'to include (when "-D -" or no "-D" is passed). ' 'to include (when "-D -" or no "-D" is passed). '
'Current options are:' + builtin_opts + '\n' 'Current options are:' + builtin_opts + '\n'
'The default is %(default)r.') 'The default is %(default)r.')
parser.add_argument('--ignore-regex',
action='store', type=str,
help='regular expression which is used to find words '
'to ignore. Matches URIs and emails by default. '
'Can be disabled by setting to "^$".')
parser.add_argument('-I', '--ignore-words', parser.add_argument('-I', '--ignore-words',
action='append', metavar='FILE', action='append', metavar='FILE',
help='file that contains words which will be ignored ' help='file that contains words which will be ignored '
@ -489,8 +496,13 @@ def print_context(lines, index, context):
print('%s %s' % ('>' if i == index else ':', lines[i].rstrip())) print('%s %s' % ('>' if i == index else ':', lines[i].rstrip()))
def extract_words(text, word_regex, ignore_word_regex):
    """Return the words in *text*, skipping anything the ignore regex matches.

    Each match of ``ignore_word_regex`` is first replaced by a single space,
    and ``word_regex.findall`` is then run on the remaining text, so ignored
    spans never contribute to the returned list.
    """
    return word_regex.findall(ignore_word_regex.sub(' ', text))
def parse_file(filename, colors, summary, misspellings, exclude_lines, def parse_file(filename, colors, summary, misspellings, exclude_lines,
file_opener, word_regex, context, options): file_opener, word_regex, ignore_word_regex, context, options):
bad_count = 0 bad_count = 0
lines = None lines = None
changed = False changed = False
@ -501,7 +513,7 @@ def parse_file(filename, colors, summary, misspellings, exclude_lines,
lines = f.readlines() lines = f.readlines()
else: else:
if options.check_filenames: if options.check_filenames:
for word in word_regex.findall(filename): for word in extract_words(filename, word_regex, ignore_word_regex):
lword = word.lower() lword = word.lower()
if lword not in misspellings: if lword not in misspellings:
continue continue
@ -555,7 +567,7 @@ def parse_file(filename, colors, summary, misspellings, exclude_lines,
fixed_words = set() fixed_words = set()
asked_for = set() asked_for = set()
for word in word_regex.findall(line): for word in extract_words(line, word_regex, ignore_word_regex):
lword = word.lower() lword = word.lower()
if lword in misspellings: if lword in misspellings:
context_shown = False context_shown = False
@ -662,6 +674,14 @@ def main(*args):
(word_regex, err), file=sys.stderr) (word_regex, err), file=sys.stderr)
parser.print_help() parser.print_help()
return EX_USAGE return EX_USAGE
ignore_word_regex = options.ignore_regex or ignore_word_regex_def
try:
ignore_word_regex = re.compile(ignore_word_regex)
except re.error as err:
print("ERROR: invalid regular expression \"%s\" (%s)" %
(ignore_word_regex, err), file=sys.stderr)
parser.print_help()
return EX_USAGE
ignore_words_files = options.ignore_words or [] ignore_words_files = options.ignore_words or []
ignore_words = set() ignore_words = set()
@ -770,7 +790,8 @@ def main(*args):
continue continue
bad_count += parse_file( bad_count += parse_file(
fname, colors, summary, misspellings, exclude_lines, fname, colors, summary, misspellings, exclude_lines,
file_opener, word_regex, context, options) file_opener, word_regex, ignore_word_regex, context,
options)
# skip (relative) directories # skip (relative) directories
dirs[:] = [dir_ for dir_ in dirs if not glob_match.match(dir_)] dirs[:] = [dir_ for dir_ in dirs if not glob_match.match(dir_)]
@ -778,7 +799,7 @@ def main(*args):
else: else:
bad_count += parse_file( bad_count += parse_file(
filename, colors, summary, misspellings, exclude_lines, filename, colors, summary, misspellings, exclude_lines,
file_opener, word_regex, context, options) file_opener, word_regex, ignore_word_regex, context, options)
if summary: if summary:
print("\n-------8<-------\nSUMMARY:") print("\n-------8<-------\nSUMMARY:")

View File

@ -455,6 +455,70 @@ def test_context(tmpdir, capsys):
assert 'ERROR' in lines[0] assert 'ERROR' in lines[0]
def test_uri(tmpdir, capsys):
    """Check that typos inside URIs are ignored by the default ignore regex."""
    path = op.join(str(tmpdir), 'uri.txt')

    def check(text):
        # Overwrite the scratch file with `text` and run the checker on it.
        with open(path, 'w') as handle:
            handle.write(text)
        return cs.main(path)

    # A typo in the path is ignored ...
    assert check('# Please see http://example.com/abandonned for info\n') == 0
    # ... but the same file is flagged once the ignore regex is disabled.
    assert cs.main(path, '--ignore-regex=^$') == 1

    cases = [
        # A different protocol.
        ('# Please see https://example.com/abandonned for info\n', 0),
        # Path ending with a slash.
        ('# Please see http://example.com/abandonned/ for info\n', 0),
        # Typo located in the domain.
        ('# Please see http://abandonned.com/example for info\n', 0),
        # Typo located in the anchor.
        ('# Please see http://example.com/ex#abandonned for info\n', 0),
        # Without a protocol the text is still checked.
        ('# Please see example.com/abandonned for info\n', 1),
        # Too few domain parts, so the text is still checked.
        ('# Please see http://abandonned for info\n', 1),
    ]
    for text, expected in cases:
        assert check(text) == expected
def test_email(tmpdir, capsys):
    """Check that typos inside email addresses are ignored by default."""
    path = op.join(str(tmpdir), 'email.txt')

    def check(text):
        # Overwrite the scratch file with `text` and run the checker on it.
        with open(path, 'w') as handle:
            handle.write(text)
        return cs.main(path)

    # A typo in the username is ignored ...
    assert check('# Please contact abandonned@example.com for info\n') == 0
    # ... but the same file is flagged once the ignore regex is disabled.
    assert cs.main(path, '--ignore-regex=^$') == 1

    cases = [
        # Typo located in the domain.
        ('# Please contact example@abandonned.com for info\n', 0),
        # Without a TLD the text is still checked.
        ('# Please contact abandonned@example for info\n', 1),
    ]
    for text, expected in cases:
        assert check(text) == expected
@contextlib.contextmanager @contextlib.contextmanager
def FakeStdin(text): def FakeStdin(text):
if sys.version[0] == '2': if sys.version[0] == '2':