toHiragana

#!/usr/bin/env python3
#
# toHiragana: Simple transliteration to help practice.
# 2013-03-21: Written by Steven J. DeRose.
#
import sys
import os
import re
import codecs

# Descriptive metadata for this script. The keys are plain strings, so the
# mapping is spelled with keyword arguments; __version__ simply mirrors the
# last-modified date recorded here.
__metadata__ = dict(
    title="toHiragana.py",
    description="Simple transliteration to help practice.",
    rightsHolder="Steven J. DeRose",
    creator="http://viaf.org/viaf/50334488",
    type="http://purl.org/dc/dcmitype/Software",
    language="Python 3.7",
    created="2013-03-21",
    modified="2024-02-21",
    publisher="http://github.com/sderose",
    license="https://creativecommons.org/licenses/by-sa/3.0/",
)
__version__ = __metadata__["modified"]

descr = """
=head1 Usage

toHiragana [options]

Transliterate ASCII to (Unicode) hiragana, for practice.

This is rudimentary, and works purely from spelling, not pronunciation.
By default, it leaves [cfjlqvx] and ph alone; but see I<--loose>.
It also doesn't do anything for consonant-clusters and word-final
consonants ('splat' will end up unchanged except for the 'a').

=head1 Options

=over

* ''--loose''
Do something about other Latin characters:
[cfjlqvx] and ph are mapped
('c' goes to 's' or 'k' depending on the following vowel).
I don't know if there's a convention for other English combinations
such as 'sh', 'ng', etc.

* ''--quiet'' OR ''-q'' Suppress most messages.

* ''--verbose'' Add more detailed messages (doesn't do much at the moment).

* ''--version'' Display version info and exit.

=head1 Related Commands

=head1 Known bugs and limitations

=History=

* 2013-03-21: Written by Steven J. DeRose.
* 2024-02-21: Drop PY2.

=Rights=

Copyright 2013 by Steven J. DeRose. This work is licensed under a Creative Commons
Attribution-Share Alike 3.0 Unported License. For further information on
this license, see [http://creativecommons.org/licenses/by-sa/3.0].

For the most recent version, see [http://www.derose.net/steve/utilities] or
[http://github.com/sderose].
"""

# Full-size hiragana, keyed by an upper-case Latin spelling of the syllable
# (note the spellings SI/TI/TU/HU rather than SHI/CHI/TSU/FU). Entries are
# listed in code-point order, so each voiced (and semi-voiced) form directly
# follows its base syllable.
hiraganaLetters = {
    # Bare vowels.
    "A": "\u3042", "I": "\u3044", "U": "\u3046", "E": "\u3048", "O": "\u304A",
    # K syllables, each followed by its G counterpart.
    "KA": "\u304B", "GA": "\u304C", "KI": "\u304D", "GI": "\u304E",
    "KU": "\u304F", "GU": "\u3050", "KE": "\u3051", "GE": "\u3052",
    "KO": "\u3053", "GO": "\u3054",
    # S syllables, each followed by its Z counterpart.
    "SA": "\u3055", "ZA": "\u3056", "SI": "\u3057", "ZI": "\u3058",
    "SU": "\u3059", "ZU": "\u305A", "SE": "\u305B", "ZE": "\u305C",
    "SO": "\u305D", "ZO": "\u305E",
    # T syllables, each followed by its D counterpart.
    "TA": "\u305F", "DA": "\u3060", "TI": "\u3061", "DI": "\u3062",
    "TU": "\u3064", "DU": "\u3065", "TE": "\u3066", "DE": "\u3067",
    "TO": "\u3068", "DO": "\u3069",
    # N syllables.
    "NA": "\u306A", "NI": "\u306B", "NU": "\u306C", "NE": "\u306D",
    "NO": "\u306E",
    # H syllables, each followed by its B and P counterparts.
    "HA": "\u306F", "BA": "\u3070", "PA": "\u3071",
    "HI": "\u3072", "BI": "\u3073", "PI": "\u3074",
    "HU": "\u3075", "BU": "\u3076", "PU": "\u3077",
    "HE": "\u3078", "BE": "\u3079", "PE": "\u307A",
    "HO": "\u307B", "BO": "\u307C", "PO": "\u307D",
    # M syllables.
    "MA": "\u307E", "MI": "\u307F", "MU": "\u3080", "ME": "\u3081",
    "MO": "\u3082",
    # Y syllables.
    "YA": "\u3084", "YU": "\u3086", "YO": "\u3088",
    # R syllables.
    "RA": "\u3089", "RI": "\u308A", "RU": "\u308B", "RE": "\u308C",
    "RO": "\u308D",
    # W syllables.
    "WA": "\u308F", "WI": "\u3090", "WE": "\u3091", "WO": "\u3092",
    # Stand-alone N, and VU.
    "N": "\u3093", "VU": "\u3094",
}

# Small-form hiragana, keyed by the same upper-case Latin spellings that
# hiraganaLetters uses for the corresponding full-size letters.
hiraganaSmallLetters = {
    # Small vowels.
    "A": "\u3041", "I": "\u3043", "U": "\u3045", "E": "\u3047", "O": "\u3049",
    # Small TU.
    "TU": "\u3063",
    # Small Y syllables.
    "YA": "\u3083", "YU": "\u3085", "YO": "\u3087",
    # Small WA, KA and KE.
    "WA": "\u308E", "KA": "\u3095", "KE": "\u3096",
}

# Remaining hiragana-related characters (sound marks, iteration marks and a
# few specials), keyed by a descriptive upper-case name. The last two entries
# lie outside the Basic Multilingual Plane, hence the 8-digit escapes.
hiraganaOthers = {
    "COMBINING KATAKANA-VOICED SOUND MARK": "\u3099",
    "COMBINING KATAKANA-SEMI-VOICED SOUND MARK": "\u309A",
    "KATAKANA-VOICED SOUND MARK": "\u309B",
    "KATAKANA-SEMI-VOICED SOUND MARK": "\u309C",
    "ITERATION MARK": "\u309D",
    "VOICED ITERATION MARK": "\u309E",
    "DIGRAPH YORI": "\u309F",
    "KATAKANA-HIRAGANA DOUBLE HYPHEN": "\u30A0",
    "KATAKANA-HIRAGANA PROLONGED SOUND MARK": "\u30FC",
    "HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK": "\uFF70",
    "LETTER ARCHAIC YE": "\U0001B001",
    "SQUARE HIRAGANA HOKA": "\U0001F200",
}

def hiragana(s, loose=None):
    """Transliterate Latin-spelled syllables in `s` into hiragana.

    The text is scanned left to right. At each position a two-character
    spelling is tried first, then a single character, both looked up
    case-insensitively in `hiraganaLetters`. Anything with no entry there
    (digits, punctuation, unmatched consonants, line ends) is copied through
    unchanged.

    Parameters:
        s: The text to transliterate.
        loose: When true, first rewrite lower-case c, ph, f, j, l, q, v and x
            to spellings built from letters the table knows about. When None
            (the default), the setting is taken from the module-level `args`
            object set up by the command-line driver, or treated as false if
            no such object exists.

    Returns:
        The transliterated string.
    """
    if loose is None:
        # The command-line driver keeps its options in the global `args`;
        # fall back to strict mode when the function is used without it.
        loose = bool(getattr(globals().get("args"), "loose", False))

    if loose:
        # 'c' follows the vowel after it; the rest are fixed substitutions.
        # These patterns are lower-case only.
        s = re.sub(r'c([ie])',  r's\1', s)
        s = re.sub(r'c([aou])', r'k\1', s)
        s = re.sub(r'ph|f',     'p',    s)
        s = re.sub(r'j',        'zh',   s)
        s = re.sub(r'l',        'r',    s)
        s = re.sub(r'q',        'kaw',  s)
        s = re.sub(r'v',        'b',    s)
        s = re.sub(r'x',        'kas',  s)

    pieces = []
    pos = 0
    while pos < len(s):
        # Longest match first, so "KA" wins over a bare "A" one step later.
        for width in (2, 1):
            chunk = s[pos:pos + width]
            kana = hiraganaLetters.get(chunk.upper())
            if kana is not None:
                pieces.append(kana)
                # Advance past everything just consumed. Rebinding the index
                # of a `for ... in range()` loop would not skip the second
                # letter, and the vowel would be emitted a second time.
                pos += len(chunk)
                break
        else:
            # No table entry: keep the original character as-is.
            pieces.append(s[pos])
            pos += 1
    return "".join(pieces)


###############################################################################
#
def doOneFile(fh):
    """Transliterate every line of an open text file to standard output.

    Parameters:
        fh: An open, readable text-mode file object.

    Returns:
        The number of lines (records) read from `fh`.
    """
    recnum = 0
    for rec in fh:
        recnum += 1
        # Each record still carries its own line terminator, so suppress
        # print()'s newline; otherwise the output comes out double-spaced.
        print(hiragana(rec), end="")
    return recnum


###############################################################################
# Main
#
if __name__ == "__main__":
    import argparse

    def processOptions() -> argparse.Namespace:
        """Build the command-line parser and return the parsed options.

        Uses BlockFormatter for the help text when that module can be
        imported, and argparse's default formatter otherwise.

        Returns:
            The argparse namespace, with attributes `iencoding`, `loose`,
            `quiet`, `verbose` and `files`.
        """
        try:
            from BlockFormatter import BlockFormatter
            formatter = BlockFormatter
        except ImportError:
            formatter = argparse.HelpFormatter

        # Build the parser exactly once, so the formatter chosen above is
        # the one actually used.
        parser = argparse.ArgumentParser(
            description=descr, formatter_class=formatter)

        parser.add_argument(
            "--iencoding", type=str, metavar="E", default="utf-8",
            help="Assume this character coding for input. Default: utf-8.")
        # The help text documents the double-dash spellings; the single-dash
        # long forms are kept as aliases so existing invocations still work.
        parser.add_argument(
            "--loose", "-loose", action='store_true',
            help='Do some extra mappings for other Latin characters.')
        parser.add_argument(
            "--quiet", "-q", action='store_true', dest='quiet',
            help='Suppress most messages.')
        parser.add_argument(
            "--verbose", "-verbose", action='count', default=0,
            help='Add more messages (repeatable).')
        parser.add_argument(
            "--version", "-version", action='version',
            version='Version of '+__version__,
            help='Display version information, then exit.')
        parser.add_argument(
            'files', type=str, nargs=argparse.REMAINDER,
            help='Path(s) to input file(s)')

        return parser.parse_args()

    # Everything below runs only when the file is executed as a script, so
    # importing the module has no side effects. `args` stays a module-level
    # name because hiragana() looks it up as a global.
    args = processOptions()

    totalRecords = 0
    totalFiles = 0

    if not args.files:
        print("No files specified")
        sys.exit()

    for f in args.files:
        totalFiles += 1
        if os.path.isfile(f):
            # Read-only text mode ("rw" is not a valid mode string); the
            # with-block closes the file even if processing raises.
            with open(f, "r", encoding=args.iencoding) as fh0:
                totalRecords += doOneFile(fh0)
        else:
            print("Can't find file '" + f + "'.")

    if not args.quiet:
        print("Done, %d files, %d records." % (totalFiles, totalRecords))

    sys.exit(0)