import os
import re
import sys


def _check_file(path, encoding):
    """Validate a single file; raise ``ValueError`` on the first violation.

    The file is read as raw bytes so that neither the platform's default
    encoding nor newline translation can hide a problem.
    """
    with open(path, mode='rb') as test_file:
        btext = test_file.read()

    # Strict decoding raises UnicodeDecodeError (a ValueError subclass) on the
    # first byte sequence that is invalid for the requested encoding.
    btext.decode(encoding=encoding, errors="strict")
    if encoding == "utf-8" and btext.startswith(b'\xEF\xBB\xBF'):
        raise ValueError("unexpected BOM in file")

    # Strict CRLF: every '\r' and every '\n' must belong to a '\r\n' pair.
    # Each pair contributes exactly one of each byte, so whatever remains
    # after subtracting the pair count is a stray character.
    crlf = btext.count(b'\r\n')
    bare_lf = btext.count(b'\n') - crlf
    bare_cr = btext.count(b'\r') - crlf
    if bare_lf:
        raise ValueError(
            "CRLF violation: found {} LF characters".format(bare_lf))
    if bare_cr:
        raise ValueError(
            "CRLF violation: found {} CR characters".format(bare_cr))


def check_encoding(encoding, scan_dir, regex_pattern):
    """Check encoding and line endings of matching files below *scan_dir*.

    The directory tree is walked with ``os.walk``. Every file whose base name
    is matched by ``re.search`` with *regex_pattern* must:

    * decode strictly as *encoding*,
    * not start with a UTF-8 byte order mark when *encoding* is ``'utf-8'``,
    * use CRLF line endings only (no bare LF and no bare CR).

    Scanning stops at the first problem, which is printed to stdout as
    ``ERROR with [<path>]: <reason>``.

    Args:
        encoding: ``'ascii'`` or ``'utf-8'``; anything else is an error.
        scan_dir: Root directory of the scan.
        regex_pattern: Regular expression applied to file names (not paths).

    Returns:
        ``0`` when every matching file passes, ``-1`` on the first failure
        (invalid arguments, unreadable file, or a rule violation).
    """
    # Tracks what is currently being examined so the error message can name it.
    fname = None
    try:
        # Explicit raise instead of ``assert``: asserts vanish under
        # ``python -O``, which would let unsupported encodings through.
        if encoding not in ('ascii', 'utf-8'):
            raise ValueError("unexpected encoding")
        pattern = re.compile(regex_pattern)
        for root, _dirs, files in os.walk(scan_dir):
            fname = root
            for f in files:
                if pattern.search(f) is None:
                    continue
                fname = os.path.join(root, f)
                _check_file(fname, encoding)
    except Exception as err:
        # Top-level boundary: callers rely on a return code, not an exception.
        print("ERROR with [{}]: {}".format(fname, err))
        return -1
    else:
        return 0


if __name__ == "__main__":
    # python check-sources.sh.py 'ascii' '.' '.*\.(cpp|h)$'
    if len(sys.argv) < 4:
        sys.stderr.write(
            "usage: {} <encoding> <scan_dir> <regex_pattern>\n".format(
                sys.argv[0]))
        sys.exit(-1)
    res = check_encoding(sys.argv[1], sys.argv[2], sys.argv[3])
    sys.exit(0 if res == 0 else -1)