-
Notifications
You must be signed in to change notification settings - Fork 34
/
repr
executable file
·102 lines (91 loc) · 4.33 KB
/
repr
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/env python3
import itertools as it, operator as op, functools as ft
import os, sys, io, re, signal
def main(args=None):
    """Print repr() of every input line, or convert input encoding with -c/--convert.

    Input files are read in binary mode and processed line by line. In the
    default mode, each line is written to stdout as its repr()/ascii() (or
    bytes-repr with -b/--binary). With -c/--convert, each line is decoded
    from the source encoding, re-encoded to the destination one, and then
    post-processed according to +opts in the spec.

    Args:
        args: Command-line arguments, without the program name.
            sys.argv[1:] is used when this is None.

    Returns:
        None, so that sys.exit(main()) reports success.

    Raises:
        SystemExit: From argparse on -h/--help or invalid arguments, including
            unrecognized +opts in the -c/--convert spec, and from the
            SIGINT/SIGTERM handlers installed here.
        UnicodeError: If a line cannot be decoded/encoded with the encodings
            in use and -x/--error-replace does not cover it.
        LookupError: If -c/--convert names a codec unknown to python.
    """
    import argparse
    import textwrap

    def dd(text):
        # Dedent a triple-quoted help text and turn leftover tabs into spaces,
        # so that these strings can follow code indentation in the source.
        text = textwrap.dedent(text).strip('\n') + '\n'
        return re.sub(r' \t+', ' ', text).replace('\t', ' ')

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description='Run python repr() to stdout on every (text) line of an input file.')
    parser.add_argument('file', nargs='*',
        help='Input file path(s). Stdin is used if none specified or in place of "-".')
    parser.add_argument('-a', '--ascii', action='store_true',
        help='Use ascii() representation instead of repr(), i.e. utf-8 chars as \\uXXXX.')
    parser.add_argument('-b', '--binary', action='store_true',
        help=dd('''
            Process file lines in binary mode, instead of decoding it with utf-8.
            Should allow to see BOM and random
            binary goop escaped as individual bytes, not codepoints.
            Anything outside of printable ascii will be encoded
            either as a special escape sequence like \\n or C-style \\xCC.'''))
    parser.add_argument('-n', '--hide-newlines', action='store_true',
        help=dd('''
            Enables "universal newlines" translation and strips last \\n from each line.
            Whether this is sufficient for mismatched line endings
            depends on how aggressive that mode is in converting newlines.'''))
    parser.add_argument('-x', '--error-replace', action='store_true',
        help='When decoding stuff (non-binary mode, convert), replace non-decodable chars.')
    parser.add_argument('-c', '--convert',
        const='utf-8', nargs='?', metavar='((enc-src:)enc-dst)(+opts)',
        help=dd('''
            Instead of repr-output, convert encoding
            from enc-src (default=utf-8-sig) to enc-dst (utf-8) and output the result.
            [[enc-src:]enc-dst] spec can be followed by + and any of the following options:
            +r - output windows \\r\\n newlines,
            +i - overwrite file in-place, same as sed -i would do
            +t - trim whitespace from line endings, and ensure final newline
            +j - json-decode and pretty-print lines, or print as -b/--binary on fail
            Examples: -c utf-8-sig+ri, -c +t, -c cp1251:utf-8+it
            Allows to use stuff like utf-8-sig or any other codec supported by python.'''))
    opts = parser.parse_args(sys.argv[1:] if args is None else args)

    file_list = opts.file or ['-']
    dec_errors = 'replace' if opts.error_replace else 'strict'

    # Parse the "[[enc-src:]enc-dst][+opts]" spec, with defaults for empty parts
    enc_src, enc_dst, enc_opts = 'utf-8-sig', opts.convert, ''
    if opts.convert:
        if ':' in enc_dst: enc_src, enc_dst = enc_dst.split(':', 1)
        if '+' in enc_dst: enc_dst, enc_opts = enc_dst.split('+', 1)
        enc_src, enc_dst = enc_src or 'utf-8-sig', enc_dst or 'utf-8'
        # Not an assert - those are stripped with python -O, and this is user input
        if set(enc_opts).difference('ritj'):
            parser.error(f'Not all encoding options recognized: {enc_opts}')
        if 'j' in enc_opts:
            import json
            import pprint
    newline = b'\r\n' if 'r' in enc_opts else b'\n'

    def convert_line(line):
        # Re-encode one input line from enc_src to enc_dst and apply +opts to it
        line = line.decode(enc_src, dec_errors).encode(enc_dst)
        # Loop also collapses repeated \r before \n, e.g. \r\r\n -> \n
        while b'\r\n' in line: line = line.replace(b'\r\n', b'\n')
        if 't' in enc_opts:
            # rstrip() also removes the line terminator, so it has to be
            # put back, otherwise all lines end up merged into one
            line_end = b'\n' if line.endswith(b'\n') else b''
            line = line.rstrip() + line_end
        if 'r' in enc_opts: line = line.replace(b'\n', b'\r\n')
        if 'j' in enc_opts:
            try:
                line = (pprint.pformat( json.loads(line),
                    indent=1, width=120, compact=True ) + '\n').encode()
            # Best-effort fallback, but must not swallow SystemExit from signal handlers
            except Exception: line = (repr(line)[2:-1] + '\n').encode()
        return line

    def repr_line(line):
        # Return repr/ascii of one input line, without quotes, as a line of output bytes
        if opts.hide_newlines: line = line.rstrip(b'\r\n')
        if opts.binary: text = repr(line)[2:-1] # [2:-1] drops b'...' wrapping
        else:
            to_text = ascii if opts.ascii else repr
            text = to_text(line.decode('utf-8-sig', dec_errors))[1:-1]
        return (text + '\n').encode()

    for sig in signal.SIGINT, signal.SIGTERM:
        signal.signal(sig, lambda sig,frm: sys.exit(os.EX_OK))

    # closefd=False - process stdout/stdin descriptors are only borrowed here,
    #  so they stay open for the caller and for any subsequent main() calls
    with open(sys.stdout.fileno(), 'wb', closefd=False) as dst_stdout:
        for p in file_list:
            p_file = p != '-'
            # In-place output is buffered until the source file is fully read
            dst_buff = io.BytesIO() if opts.convert and 'i' in enc_opts and p_file else None
            dst = dst_stdout if dst_buff is None else dst_buff
            line_end_final = None # stays None for empty input
            with open(p if p_file else sys.stdin.fileno(), 'rb', closefd=p_file) as src:
                for line in src:
                    line = convert_line(line) if opts.convert else repr_line(line)
                    dst.write(line)
                    dst.flush()
                    line_end_final = line.endswith(b'\n')
            if opts.convert and 't' in enc_opts and line_end_final is False:
                dst.write(newline)
                dst.flush()
            if dst_buff is not None:
                with open(p, 'wb') as dst_file: dst_file.write(dst_buff.getvalue())
if __name__ == '__main__':
    # Run as a script: whatever main() returns is used as the process exit status.
    sys.exit(main())