generate-rowcolumn-helpers.py

#!/usr/bin/env python3

# This script generates functions to convert row/column numbers encoded as
# diacritics to actual numbers.
# It reads the file rowcolumn-diacritics.txt from the currend directory and
# produces the following files:
# - rowcolumn_diacritics_helpers.c - contains a helper function to convert from
#   diacritics to row/column numbers.
# - rowcolumn_diacritics.sh - contains an array of row/column diacritics (can be
#   used by shell scripts to generate image placeholders).
#
# The script also checks some desirable properties of row/column diacritics,
# e.g. that image placeholders are in normal form.

import unicodedata
import sys

# codes of all row/column diacritics
codes = []

with open("./rowcolumn-diacritics.txt", "r") as file:
    for line in file.readlines():
        if line.startswith('#'):
            continue
        code = int(line.split(";")[0], 16)
        char = chr(code)
        assert unicodedata.combining(char) == 230
        codes.append(code)

print("Generating ./rowcolumn_diacritics_helpers.c")
with open("./rowcolumn_diacritics_helpers.c", "w") as file:
    range_start_num = 1
    range_start = 0
    range_end = 0

    def print_range():
        if range_start >= range_end:
            return
        for code in range(range_start, range_end):
            print("\tcase " + hex(code) + ":", file=file)
        print("\t\treturn code - " + hex(range_start) + " + " +
              str(range_start_num) + ";",
              file=file)

    print("#include <stdint.h>\n", file=file)
    print("uint16_t diacritic_to_num(uint32_t code)\n{", file=file)
    print("\tswitch (code) {", file=file)

    for code in codes:
        if range_end == code:
            range_end += 1
        else:
            print_range()
            range_start_num += range_end - range_start
            range_start = code
            range_end = code + 1
    print_range()

    print("\t}", file=file)
    print("\treturn 0;", file=file)
    print("}", file=file)

print("Generating ./rowcolumn_diacritics_helpers.rs")
with open("./rowcolumn_diacritics_helpers.rs", "w") as file:
    range_start_num = 1
    range_start = 0
    range_end = 0

    def print_range():
        if range_start >= range_end:
            return
        print("        " +
              "'\\u{" + hex(range_start)[2:] + "}'" + "..=" +
              "'\\u{" + hex(range_end - 1)[2:] + "}'" + " => " +
              "c as u32 - " + hex(range_start) + "u32 + " +
              str(range_start_num) + ",",
              file=file)

    print("pub fn diacritic_to_num(c: char) -> u32 {", file=file)
    print("    match c {", file=file)

    for code in codes:
        if range_end == code:
            range_end += 1
        else:
            print_range()
            range_start_num += range_end - range_start
            range_start = code
            range_end = code + 1
    print_range()

    print("        _ => 0", file=file)
    print("    }", file=file)
    print("}", file=file)

print("Generating ./rowcolumn_diacritics.sh")
with open("./rowcolumn_diacritics.sh", "w") as file:
    print("ROWCOLUMN_DIACRITICS=(", file=file, end="")
    for code in codes:
        print('"\\U' + format(code, 'x') + '" ', file=file, end="")
    print(")", file=file)

print("Checking that image placeholder cannot be normalized further")

img_char = chr(0x10EEEE)
for row_code in codes:
    row_char = chr(row_code)
    for col_code in codes:
        col_char = chr(col_code)
        cell = img_char + row_char + col_char
        for nf in ["NFC", "NFKC", "NFD", "NFKD"]:
            if not unicodedata.is_normalized(nf, cell):
                print(cell)
                print("unnormalized!", nf, [hex(ord(img_char)), hex(row_code), hex(col_code)])
                normalized = unicodedata.normalize(nf, cell)
                print("normalized:", [hex(ord(c)) for c in normalized])
                exit(1)

print("Checking that the row/column marks are not fused with anything "
      "letter-like during normalization")

# Collect somewhat normal characters.
normal_symbols = []
for i in range(sys.maxunicode):
    string = chr(i)
    if unicodedata.category(string)[0] not in ['L', 'P', 'N', 'S']:
        continue
    is_normalized = True
    for nf in ["NFC", "NFKC", "NFD", "NFKD"]:
        if not unicodedata.is_normalized(nf, string):
            is_normalized = False
    if is_normalized:
        normal_symbols.append(i)

for code in codes:
    print("Checking " + hex(code), end="\r")
    for num in normal_symbols:
        string = chr(num) + chr(code)
        for nf in ["NFC", "NFKC", "NFD", "NFKD"]:
            if not unicodedata.is_normalized(nf, string):
                normalized = unicodedata.normalize(nf, string)
                print("WARNING: " + hex(num) + " + " + hex(code) +
                      " is normalized to " + normalized,
                      " ".join(hex(ord(c)) for c in normalized))