diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py index 7d490c66..edf259e2 100755 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -28,8 +28,11 @@ import sys import textwrap word_regex_def = u"[\\w\\-'’`]+" -uri_regex_def = (u"\\b((?:https?|t?ftp|file|git|smb)://[^\\s'\"]*|" - u"[\\w.%+-]+@[\\w.-]+)\\b") +# While we want to treat characters like ( or " as okay for a starting break, +# these may occur unescaped in URIs, and so we are more restrictive on the +# endpoint. Emails are more restrictive, so the endpoint remains flexible. +uri_regex_def = (u"(\\b(?:https?|t?ftp|file|git|smb)://[^\\s]+(?=$|\\s)|" + u"\\b[\\w.%+-]+@[\\w.-]+\\b)") encodings = ('utf-8', 'iso-8859-1') USAGE = """ \t%prog [OPTIONS] [file1 file2 ... fileN] @@ -307,6 +310,10 @@ def parse_options(args): 'underscore, the hyphen, and the apostrophe is ' 'used to build words. This option cannot be ' 'specified together with --write-changes.') + parser.add_argument('--uri-regex', + action='store', type=str, + help='regular expression which is used to find URIs ' + 'and emails. 
A default expression is provided.') parser.add_argument('-s', '--summary', action='store_true', default=False, help='print summary of fixes') @@ -731,7 +738,14 @@ def main(*args): return EX_USAGE build_ignore_words(ignore_words_file, ignore_words) - uri_regex = re.compile(uri_regex_def) + uri_regex = options.uri_regex or uri_regex_def + try: + uri_regex = re.compile(uri_regex) + except re.error as err: + print("ERROR: invalid --uri-regex \"%s\" (%s)" % + (uri_regex, err), file=sys.stderr) + parser.print_help() + return EX_USAGE uri_ignore_words = parse_ignore_words_option(options.uri_ignore_words_list) if options.dictionary: diff --git a/codespell_lib/tests/test_basic.py b/codespell_lib/tests/test_basic.py index 931c548d..f510c1d6 100644 --- a/codespell_lib/tests/test_basic.py +++ b/codespell_lib/tests/test_basic.py @@ -6,6 +6,7 @@ import contextlib import inspect import os import os.path as op +import re from shutil import copyfile import subprocess import sys @@ -13,7 +14,7 @@ import sys import pytest import codespell_lib as cs_ -from codespell_lib._codespell import EX_USAGE, EX_OK, EX_DATAERR +from codespell_lib._codespell import uri_regex_def, EX_USAGE, EX_OK, EX_DATAERR def test_constants(): @@ -488,6 +489,40 @@ def test_ignore_regex_option(tmpdir, capsys): assert cs.main(f.name, r'--ignore-regex=\bdonn\b') == 1 +def test_uri_regex_option(tmpdir, capsys): + """Test --uri-regex option functionality.""" + d = str(tmpdir) + + # Invalid regex. + code, stdout, _ = cs.main('--uri-regex=(', std=True) + assert code == EX_USAGE + assert 'usage:' in stdout + + with open(op.join(d, 'flag.txt'), 'w') as f: + f.write('# Please see http://abandonned.com for info\n') + + # By default, the standard regex is used. + assert cs.main(f.name) == 1 + assert cs.main(f.name, '--uri-ignore-words-list=abandonned') == 0 + + # If empty, the default regex is used. + assert cs.main(f.name, '--uri-regex=', + '--uri-ignore-words-list=abandonned') == 0 + + # Can manually match urls.
+ assert cs.main(f.name, '--uri-regex=\\bhttp.*\\b', + '--uri-ignore-words-list=abandonned') == 0 + + # Can also match arbitrary content. + with open(op.join(d, 'flag.txt'), 'w') as f: + f.write('abandonned') + assert cs.main(f.name) == 1 + assert cs.main(f.name, '--uri-ignore-words-list=abandonned') == 1 + assert cs.main(f.name, '--uri-regex=.*') == 1 + assert cs.main(f.name, '--uri-regex=.*', + '--uri-ignore-words-list=abandonned') == 0 + + def test_uri_ignore_words_list_option_uri(tmpdir, capsys): """Test ignore regex option functionality.""" d = str(tmpdir) @@ -522,6 +557,7 @@ def test_uri_ignore_words_list_option_uri(tmpdir, capsys): # Variations where an error is ignored. for variation in ('# Please see http://abandonned for info\n', '# Please see "http://abandonned" for info\n', + '# Please see "http://foo"abandonned for info\n', '# Please see https://abandonned for info\n', '# Please see ftp://abandonned for info\n', '# Please see http://example/abandonned for info\n', @@ -540,8 +576,7 @@ def test_uri_ignore_words_list_option_uri(tmpdir, capsys): for variation in ('# Please see abandonned/ for info\n', '# Please see http:abandonned for info\n', '# Please see foo/abandonned for info\n', - '# Please see http://foo abandonned for info\n', - '# Please see "http://foo"abandonned for info\n'): + '# Please see http://foo abandonned for info\n'): with open(op.join(d, 'flag.txt'), 'w') as f: f.write(variation) assert cs.main(f.name) == 1, variation @@ -596,6 +631,99 @@ def test_uri_ignore_words_list_option_email(tmpdir, capsys): assert cs.main(f.name, variation_option) == 1, variation +def test_uri_regex_def(): + uri_regex = re.compile(uri_regex_def) + + # Tests based on https://mathiasbynens.be/demo/url-regex + true_positives = ( + 'http://foo.com/blah_blah', + 'http://foo.com/blah_blah/', + 'http://foo.com/blah_blah_(wikipedia)', + 'http://foo.com/blah_blah_(wikipedia)_(again)', + 'http://www.example.com/wpstyle/?p=364', + 
'https://www.example.com/foo/?bar=baz&inga=42&quux', + 'http://✪df.ws/123', + 'http://userid:password@example.com:8080', + 'http://userid:password@example.com:8080/', + 'http://userid@example.com', + 'http://userid@example.com/', + 'http://userid@example.com:8080', + 'http://userid@example.com:8080/', + 'http://userid:password@example.com', + 'http://userid:password@example.com/', + 'http://142.42.1.1/', + 'http://142.42.1.1:8080/', + 'http://➡.ws/䨹', + 'http://⌘.ws', + 'http://⌘.ws/', + 'http://foo.com/blah_(wikipedia)#cite-1', + 'http://foo.com/blah_(wikipedia)_blah#cite-1', + 'http://foo.com/unicode_(✪)_in_parens', + 'http://foo.com/(something)?after=parens', + 'http://☺.damowmow.com/', + 'http://code.google.com/events/#&product=browser', + 'http://j.mp', + 'ftp://foo.bar/baz', + 'http://foo.bar/?q=Test%20URL-encoded%20stuff', + 'http://مثال.إختبار', + 'http://例子.测试', + 'http://उदाहरण.परीक्षा', + "http://-.~_!$&'()*+,;=:%40:80%2f::::::@example.com", + 'http://1337.net', + 'http://a.b-c.de', + 'http://223.255.255.254', + ) + true_negatives = ( + 'http://', + '//', + '//a', + '///a', + '///', + 'foo.com', + 'rdar://1234', + 'h://test', + '://should.fail', + 'ftps://foo.bar/', + ) + false_positives = ( + 'http://.', + 'http://..', + 'http://../', + 'http://?', + 'http://??', + 'http://??/', + 'http://#', + 'http://##', + 'http://##/', + 'http:///a', + 'http://-error-.invalid/', + 'http://a.b--c.de/', + 'http://-a.b.co', + 'http://a.b-.co', + 'http://0.0.0.0', + 'http://10.1.1.0', + 'http://10.1.1.255', + 'http://224.1.1.1', + 'http://1.1.1.1.1', + 'http://123.123.123', + 'http://3628126748', + 'http://.www.foo.bar/', + 'http://www.foo.bar./', + 'http://.www.foo.bar./', + 'http://10.1.1.1', + ) + + boilerplate = 'Surrounding text %s more text' + + for uri in true_positives + false_positives: + assert uri_regex.findall(uri) == [uri], uri + assert uri_regex.findall(boilerplate % uri) == [uri], uri + + for uri in true_negatives: + assert not uri_regex.findall(uri), uri 
+ assert not uri_regex.findall(boilerplate % uri), uri + + @contextlib.contextmanager def FakeStdin(text): if sys.version[0] == '2':