-
Notifications
You must be signed in to change notification settings - Fork 1
/
generate-rowcolumn-helpers.py
executable file
·144 lines (124 loc) · 4.81 KB
/
generate-rowcolumn-helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#!/usr/bin/env python3
# This script generates functions to convert row/column numbers encoded as
# diacritics to actual numbers.
# It reads the file rowcolumn-diacritics.txt from the current directory and
# produces the following files:
# - rowcolumn_diacritics_helpers.c - contains a helper function to convert from
# diacritics to row/column numbers.
# - rowcolumn_diacritics_helpers.rs - contains the same helper function written
# in Rust.
# - rowcolumn_diacritics.sh - contains an array of row/column diacritics (can be
# used by shell scripts to generate image placeholders).
#
# The script also checks some desirable properties of row/column diacritics,
# e.g. that image placeholders are in normal form.
import unicodedata
import sys
# Code points of all row/column diacritics, in the order they appear in
# rowcolumn-diacritics.txt. Each data line starts with a hexadecimal code
# point followed by ';'.
codes = []
with open("./rowcolumn-diacritics.txt", "r") as file:
    for line in file:
        # Skip comment lines, and also blank lines (e.g. a trailing empty
        # line), which would otherwise make int() raise ValueError.
        if line.startswith('#') or not line.strip():
            continue
        code = int(line.split(";")[0], 16)
        char = chr(code)
        # Every diacritic must have canonical combining class 230. This is an
        # explicit check instead of `assert` so it still runs under
        # `python -O`.
        if unicodedata.combining(char) != 230:
            raise ValueError(
                "U+%04X has combining class %d, expected 230"
                % (code, unicodedata.combining(char)))
        codes.append(code)
# Generate a C function mapping a diacritic code point to its 1-based position
# in `codes` (0 for any other code point). Consecutive code points share one
# `return` statement.
print("Generating ./rowcolumn_diacritics_helpers.c")

# Group the code points into runs of consecutive values. Each run is stored as
# (first code point, one past the last code point, number of the first one).
c_runs = []
run_first, run_stop, run_num = 0, 0, 1
for cp in codes:
    if cp == run_stop:
        # Still consecutive: extend the current run.
        run_stop += 1
        continue
    if run_first < run_stop:
        c_runs.append((run_first, run_stop, run_num))
    run_num += run_stop - run_first
    run_first, run_stop = cp, cp + 1
if run_first < run_stop:
    c_runs.append((run_first, run_stop, run_num))

# Assemble the output line by line; every entry is written followed by a
# newline, so an entry ending in "\n" yields an empty line after it.
c_lines = [
    "#include <stdint.h>\n",
    "uint16_t diacritic_to_num(uint32_t code)\n{",
    "\tswitch (code) {",
]
for first, stop, num in c_runs:
    c_lines.extend("\tcase " + hex(cp) + ":" for cp in range(first, stop))
    c_lines.append("\t\treturn code - " + hex(first) + " + " + str(num) + ";")
c_lines += ["\t}", "\treturn 0;", "}"]

with open("./rowcolumn_diacritics_helpers.c", "w") as file:
    for text in c_lines:
        print(text, file=file)
# Generate a Rust function mapping a diacritic character to its 1-based
# position in `codes` (0 for any other character). Each run of consecutive
# code points becomes one inclusive-range match arm.
print("Generating ./rowcolumn_diacritics_helpers.rs")


def rust_char(cp):
    # Rust character literal for a code point, e.g. '\u{305}'.
    return "'\\u{" + format(cp, "x") + "}'"


# Each arm is [first code point, last code point, number of the first one].
arms = []
for index, cp in enumerate(codes):
    if arms and arms[-1][1] + 1 == cp:
        # Consecutive with the previous code point: widen the current arm.
        arms[-1][1] = cp
    else:
        arms.append([cp, cp, index + 1])

rs_lines = ["pub fn diacritic_to_num(c: char) -> u32 {", " match c {"]
for first, last, number in arms:
    rs_lines.append(
        " " + rust_char(first) + "..=" + rust_char(last) + " => " +
        "c as u32 - " + hex(first) + "u32 + " + str(number) + ",")
rs_lines += [" _ => 0", " }", "}"]

with open("./rowcolumn_diacritics_helpers.rs", "w") as file:
    file.write("\n".join(rs_lines) + "\n")
# Generate a shell script defining the ROWCOLUMN_DIACRITICS array, with one
# double-quoted "\U<hex>" escape per diacritic.
print("Generating ./rowcolumn_diacritics.sh")
# Every element is followed by a single space, including the last one.
sh_items = "".join('"\\U' + format(cp, 'x') + '" ' for cp in codes)
with open("./rowcolumn_diacritics.sh", "w") as file:
    file.write("ROWCOLUMN_DIACRITICS=(" + sh_items + ")\n")
# Check that every placeholder cell - U+10EEEE followed by a row diacritic and
# a column diacritic - is already normalized in all four Unicode normal forms.
# On the first cell that is not, print diagnostics and exit with status 1.
print("Checking that image placeholder cannot be normalized further")
img_char = chr(0x10EEEE)
for row_code in codes:
    row_char = chr(row_code)
    for col_code in codes:
        col_char = chr(col_code)
        cell = img_char + row_char + col_char
        for nf in ["NFC", "NFKC", "NFD", "NFKD"]:
            if unicodedata.is_normalized(nf, cell):
                continue
            print(cell)
            print("unnormalized!", nf, [hex(ord(img_char)), hex(row_code), hex(col_code)])
            normalized = unicodedata.normalize(nf, cell)
            print("normalized:", [hex(ord(c)) for c in normalized])
            # Use sys.exit() rather than the exit() builtin: the latter is
            # injected by the `site` module for interactive use and is not
            # available when Python runs without it (e.g. `python -S`).
            sys.exit(1)
print("Checking that the row/column marks are not fused with anything "
      "letter-like during normalization")
# Collect somewhat normal characters: code points whose general category is a
# letter (L), punctuation (P), number (N) or symbol (S) and which are, on
# their own, already normalized in all four normal forms.
normal_symbols = []
# range() excludes its end point, so add 1 to include sys.maxunicode itself.
for i in range(sys.maxunicode + 1):
    string = chr(i)
    if unicodedata.category(string)[0] not in ['L', 'P', 'N', 'S']:
        continue
    # all() stops at the first normal form the character fails.
    if all(unicodedata.is_normalized(nf, string)
           for nf in ["NFC", "NFKC", "NFD", "NFKD"]):
        normal_symbols.append(i)
# For every diacritic, append it to each collected symbol and print a warning
# for each normal form in which the resulting pair is not already normalized.
for code in codes:
    # Progress indicator, overwritten in place thanks to the carriage return.
    print("Checking " + hex(code), end="\r")
    mark = chr(code)
    for num in normal_symbols:
        pair = chr(num) + mark
        for nf in ("NFC", "NFKC", "NFD", "NFKD"):
            if unicodedata.is_normalized(nf, pair):
                continue
            result = unicodedata.normalize(nf, pair)
            codepoints = " ".join(hex(ord(c)) for c in result)
            print("WARNING: " + hex(num) + " + " + hex(code) +
                  " is normalized to " + result,
                  codepoints)