pre-commit-hooks/pre_commit_hooks/fix_encoding_pragma.py

from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals

import argparse
import collections

DEFAULT_PRAGMA = b'# -*- coding: utf-8 -*-\n'


def has_coding(line):
    if not line.strip():
        return False
    return (
        line.lstrip()[0:1] == b'#' and (
            b'unicode' in line or
            b'encoding' in line or
            b'coding:' in line or
            b'coding=' in line
        )
    )


class ExpectedContents(collections.namedtuple(
        'ExpectedContents', ('shebang', 'rest', 'pragma_status'),
)):
    """
    pragma_status:
    - True: has exactly the coding pragma expected
    - False: missing coding pragma entirely
    - None: has a coding pragma, but it does not match
    """
    __slots__ = ()

    @property
    def has_any_pragma(self):
        return self.pragma_status is not False

    def is_expected_pragma(self, remove):
        expected_pragma_status = not remove
        return self.pragma_status is expected_pragma_status


def _get_expected_contents(first_line, second_line, rest, expected_pragma):
    if first_line.startswith(b'#!'):
        shebang = first_line
        potential_coding = second_line
    else:
        shebang = b''
        potential_coding = first_line
        rest = second_line + rest

    if potential_coding == expected_pragma:
        pragma_status = True
    elif has_coding(potential_coding):
        pragma_status = None
    else:
        pragma_status = False
        rest = potential_coding + rest

    return ExpectedContents(
        shebang=shebang, rest=rest, pragma_status=pragma_status,
    )


def fix_encoding_pragma(f, remove=False, expected_pragma=DEFAULT_PRAGMA):
    expected = _get_expected_contents(
        f.readline(), f.readline(), f.read(), expected_pragma,
    )

    # Special cases for empty files
    if not expected.rest.strip():
        # If a file only has a shebang or a coding pragma, remove it
        if expected.has_any_pragma or expected.shebang:
            f.seek(0)
            f.truncate()
            f.write(b'')
            return 1
        else:
            return 0

    if expected.is_expected_pragma(remove):
        return 0

    # Otherwise, write out the new file
    f.seek(0)
    f.truncate()
    f.write(expected.shebang)
    if not remove:
        f.write(expected_pragma)
    f.write(expected.rest)

    return 1


def _normalize_pragma(pragma):
    if not isinstance(pragma, bytes):
        pragma = pragma.encode('UTF-8')
    return pragma.rstrip() + b'\n'


def _to_disp(pragma):
    return pragma.decode().rstrip()


def main(argv=None):
    parser = argparse.ArgumentParser('Fixes the encoding pragma of python files')
    parser.add_argument('filenames', nargs='*', help='Filenames to fix')
    parser.add_argument(
        '--pragma', default=DEFAULT_PRAGMA, type=_normalize_pragma,
        help='The encoding pragma to use.  Default: {}'.format(
            _to_disp(DEFAULT_PRAGMA),
        ),
    )
    parser.add_argument(
        '--remove', action='store_true',
        help='Remove the encoding pragma (Useful in a python3-only codebase)',
    )
    args = parser.parse_args(argv)

    retv = 0

    if args.remove:
        fmt = 'Removed encoding pragma from {filename}'
    else:
        fmt = 'Added `{pragma}` to {filename}'

    for filename in args.filenames:
        with open(filename, 'r+b') as f:
            file_ret = fix_encoding_pragma(
                f, remove=args.remove, expected_pragma=args.pragma,
            )
            retv |= file_ret
            if file_ret:
                print(fmt.format(
                    pragma=_to_disp(args.pragma), filename=filename,
                ))

    return retv

if __name__ == "__main__":
    exit(main())