chr

#!/usr/bin/env perl -w
#
# chr: Show char for a given code point number.
# 2007-10-29: Written by Steven J. DeRose.
#
use strict;
use Getopt::Long;
use Encode;
use charnames ':full';
use Unicode::UCD 'charscript';
use Unicode::UCD 'charblock';
use HTML::Entities;
#use Encode::Escape; #::Unicode;

use alogging;

# Descriptive metadata about this script, kept as a package variable so it
# can be read from outside the file.
our %metadata = (
    title        => "chr",
    description  => "Retrieve lots of data on Unicode characters.",
    rightsHolder => "Steven J. DeRose",
    creator      => "http://viaf.org/viaf/50334488",
    type         => "http://purl.org/dc/dcmitype/Software",
    language     => "Perl 5.18",
    created      => "2007-10-29",
    modified     => "2024-02-26",
    publisher    => "http://github.com/sderose",
    license      => "https://creativecommons.org/licenses/by-sa/3.0/",
);

# The date reported by --version is simply the last-modified date above.
our $VERSION_DATE = $metadata{modified};

=pod

=head1 Usage

chr [options] [nums]

Display information about the character(s)
corresponding to the code point(s) number(s) in I<nums>.

For example, "chr 0x2203" produces:

  0x2203:
    Bases:           o21003 d8707 x2203; U+2203, utf-8 \xe2\x88\x83
    Unicode Name:    THERE EXISTS
    Unicode Script:  Common
    Unicode Block:   Mathematical Operators
    Unicode Plane:   0: Basic Multilingual
    URI form:        %e2%88%83
    XML forms:       &#8707; &#x2203; &exist;

I<nums> may be in hex (0x...), octal (0...), binary (0b...), or decimal.
With the I<--utf8> option, you can give I<nums> as hex UTF8.

Control characters and spaces will be displayed as mnemonics.
Tries to get the full Unicode char name for chars >255.
Display of other characters depends on your terminal program.


=head1 Options

(prefix 'no' to invert when applicable)

=over

=item * B<--cp1252>

Show Windows Code Page 1252 meanings of characters d128-d159 (the rest
of CP1252 matches Latin-1).

=item * B<--iencoding> I<e>

Assume the output is in character set I<e>.
Not yet supported.
See also I<--listEncodings>, I<--cp1252>.

=item * B<--listEncodings>

Show all the encodings supported by I<--iencoding>, and exit.

=item * B<--long>

Give long names for control characters, instead of mnemonics.

=item * B<--quiet> OR B<-q>

Suppress most messages.

=item * B<--utf8>

Interpret the command-line numbers as representations of UTF8 (the
most common representation for Unicode).

The input tokens can be in hexadecimal or decimal. For example,
the C<Left Double Quotation Mark> character is Unicode code point
U+201C (or o20034, d8220, x201c). In UTF8, all characters > 127 are encoded
as multiple bytes, in this case the 3 byte sequence \xe2, \x80, \x9c. To
use C<chr> to identify this sequence, do:

    chr --utf8 0xe2809c
    chr --utf8 226.128.156

With decimal, there must be exactly 9 digits, optionally separated into
3 groups of 3 using non-word-characters (to use space, quote the argument).
This limits the range of Unicode supported for decimal.

You can enter a UTF-8 hex sequence that represents more than one character.
C<chr> will find the boundaries, and describe each character in turn.
However, C<chr> cannot do the same for decimal representations of more than
1 character together.

=item * B<--verbose> OR B<-v>

Add more detailed messages.

=item * B<--version>

Display version info and exit.

=back


=head1 Known bugs and limitations

Some terminal programs assume Latin-1, ASCII, or even CP1252,
while Perl most readily writes out Unicode (as utf8).
So output of literal characters outside the ASCII range,
going to a non-Unicode-supporting terminal, may look funny.


=head1 Related commands

C<CharDisplay> Python utility to just print a ton of data about a given
code point.

C<ord> -- Do the reverse (that is, find the numeric code point(s)
for given character(s)).

C<countChars> -- Find and/or count characters in particular ranges,
including XML character references, URI escapes, etc.

C<charnames> -- CPAN package to deal with Unicode properties and names.

C<showNumberInBases> -- takes numbers in any of several forms, and shows them
in multiple forms, much like this script also does with I<--nums>.

C<XmlTuples.pm> -- Parse various internal data about characters (optional).


=head1 History

    2007-10-29: Written by Steven J. DeRose.
    2008-01-02 sjd: Add -n. Fix name access for C1 range.
    2008-09-03 sjd: Move to BSD.
    2010-01-06 sjd: Add charnames for Unicode.
    2010-05-03 sjd: perldoc. Start fixing base recognition.
    2011-06-22 sjd: Fix bug handling unknown Unicode char names.
    2011-06-29 sjd: Eliminate -multiple option. Fix decimal input.
    2011-08-04 sjd: Support -cp1252. Fix oct() usage. Check Unicode max.
    2012-01-27 sjd: Keep shifting data to use XmlTuples.
    2012-02-28 sjd: Last of XmlTuples integration. Recognize C0 abbrs.
    2012-07-27 sjd: Trap bad unicode char in isURIchar().
    2012-08-13f sjd: Better message if arg isn't numeric. Do URI form.
        Add HTML::Entities. Clean up display.
    2013-01-14 sjd: Add Unicode script and block.
    2013-06-19: Add Unicode equivalents for CP1252 chars.
    2013-08-19: Add utf-8 input:
        e28098 = u+2018 = lsquo
        e28099 = u+2019 = rsquo
        e2809c = u+201C = ldquo
        e2809d = u+201D = rdquo
    2015-07-07: Allow decimal "utf8" input: \226\128\153, etc.
    2015-09-15: Make -q useful. Make table of plane names.
    2024-02-26: Drop XSV data on MacRoman etc. New layout.


=head1 To do

    Finish losing XmlTuples, c0longNames, etc.
    Accept HTML named character entities.
    Finish --iencoding, --mac.
    Support input of UTF-8 byte sequences?
    Option to display all Unicode properties?
    Integrate with 'ord'?
    Accept ^A...^Z, ^@.


=head1 Ownership

This work by Steven J. DeRose is licensed under a Creative Commons
Attribution-Share Alike 3.0 Unported License. For further information on
this license, see L<http://creativecommons.org/licenses/by-sa/3.0/>.

For the most recent version, see L<http://www.derose.net/steve/utilities/>.

=cut

# Names of the Unicode Planes, indexed by plane number (code point >> 16).
# Each entry is [ plane name, alias ]; showUnicodeInfo() appends the alias
# to the plane name when the --dnd option is on (the default).
#
# Plane 3 has been the Tertiary Ideographic Plane since Unicode 13.0;
# planes 4-13 remain unassigned.
my @uPlanes = (
    [ "Basic Multilingual",                "Prime Material" ],   #  0
    [ "Supplementary Multilingual",        "Celestia" ],         #  1
    [ "Supplementary Ideographic",         "Bytopia" ],          #  2
    [ "Tertiary Ideographic",              "Elysium" ],          #  3
    [ "Unassigned 4",                      "Beastlands" ],       #  4
    [ "Unassigned 5",                      "Arborea" ],          #  5
    [ "Unassigned 6",                      "Ysgard" ],           #  6
    [ "Unassigned 7",                      "Limbo" ],            #  7
    [ "Unassigned 8",                      "Pandemonium" ],      #  8
    [ "Unassigned 9",                      "Abyss" ],            #  9
    [ "Unassigned 10",                     "Carceri" ],          # 10
    [ "Unassigned 11",                     "Hades" ],            # 11
    [ "Unassigned 12",                     "Gehenna" ],          # 12
    [ "Unassigned 13",                     "Baator" ],           # 13
    [ "Supplementary Special-purpose",     "Acheron" ],          # 14
    [ "Supplementary Private Use Area A",  "Mechanus" ],         # 15
    [ "Supplementary Private Use Area B",  "Arcadia" ],          # 16
);

###############################################################################
# These are from sjdUtils.pm.
#
# Plane 0     0x000000 to 0x00FFFF  BMP
# Plane 1     0x010000 to 0x01FFFF  Suppl MP
# Plane 2     0x020000 to 0x02FFFF  Suppl Ideographic
# Plane 3     0x030000 to 0x03FFFF  Tertiary Ideographic
# Plane 4-13  0x040000 to 0x0DFFFF  Unassigned
# Plane 14    0x0E0000 to 0x0EFFFF  Suppl Special-purpose
# Plane 15-16 0x0F0000 to 0x10FFFF  Suppl Private Use Area
#
# Return 1 if $n is a code point this script is willing to describe as
# Unicode, else 0. Rejected are: anything outside 0..0x10FFFF, the two
# values U+FFFE and U+FFFF, and the range 0x80..0x9F.
sub isUnicodeCodePoint {
    my ($n) = @_;

    # Outside the Unicode code space altogether.
    return(0) if ($n < 0x000000 || $n > 0x10FFFF);

    # U+FFFE and U+FFFF.
    return(0) if ($n == 0x00FFFE || $n == 0x00FFFF);

    # 0x80 through 0x9F inclusive.
    return(0) if ($n >= 0x000080 && $n < 0x0000a0);

    return(1);
}

# Body of a character class matching XML 1.0 production [2] (Char):
#     #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
# The supplementary-plane range is required; without it every character
# above the BMP would be misreported as not allowed in XML.
my $xmlCharExpr =
    "\t\n\r\x20-\x{D7FF}\x{E000}-\x{FFFD}" .
    "\x{10000}-\x{10FFFF}";

# Anchored with \A...\z (not ^...$) so a trailing newline cannot sneak
# past the single-character test.
my $xce      = qr/\A[$xmlCharExpr]\z/;

# Return 1 if $c is exactly one character permitted by XML production [2],
# else 0.
sub isXmlChar { # XML REC, production 2
    my ($c) = @_;
    return(($c =~ m/$xce/) ? 1:0);
}

# Attempt to load Perl module $mod at run time.
# Returns 1 on success and 0 on failure; on failure a warning is issued
# unless $quiet is true.
# NOTE(review): $mod is interpolated into a string eval, so callers should
# only pass trusted module names.
sub Xtry_module {
    my ($mod, $quiet) = @_;
    eval("use $mod");
    return(1) unless ($@);

    if (!$quiet) {
        warn "try_module: Couldn't find Perl module '$mod'\n";
    }
    return(0);
}

# Print one indented "label  value" output line.
#   $label  Left-justified in a 16-column field.
#   $data   Value to show; may be omitted, in which case nothing follows
#           the label.
# Uses defined-or rather than || so that legitimate false values such as
# the character "0" or the number 0 are printed instead of being blanked.
sub pline {
    my ($label, $data) = @_;
    printf("    %-16s %s\n", $label, $data // "");
}

# Return the UTF-8 encoding of code point $n as a string of two-digit
# lowercase hex bytes, each preceded by $sep. $sep defaults to "%", which
# yields the form used in URI escapes; getBases() passes "\x" instead.
#
sub getUTF8 {
    my ($n, $sep) = @_;
    $sep = "%" unless (defined $sep);
    my $bytes = Encode::encode('utf8', chr($n));
    return(join("", map { sprintf("%s%02x", $sep, $_) } unpack("C*", $bytes)));
}


###############################################################################
# Options
#
my $cp1252    = 0;   # --cp1252: always show the cp1252 reading of d128-d159
my $dnd       = 1;   # --dnd: append the alias column of @uPlanes to plane names
my $iencoding = "";  # --iencoding: accepted here; documented as not yet supported
my $long      = 0;   # --long: long names for control characters
my $quiet     = 0;   # --quiet: suppress the less essential output lines
my $utf8      = 0;   # --utf8: arguments are UTF-8 byte sequences, not code points
my $verbose   = 0;   # --verbose: may be repeated

my %getoptHash = (
    "cp1252!"           => \$cp1252,
    "dnd!"              => \$dnd,
    # List-form system() bypasses the shell, so a script path containing
    # spaces or shell metacharacters is passed to perldoc intact.
    "h|help|?"          => sub { system("perldoc", $0); exit; },
    "iencoding=s"       => \$iencoding,
    "listEncodings"     => sub {
        # Print the encoding names known to Encode, starting a new output
        # line each time the first two characters of the name change.
        warn "\nEncodings available:\n";
        my $last = ""; my $buf = "";
        for my $k (Encode->encodings(":all")) {
            my $cur = substr($k,0,2);
            if ($cur ne $last) {
                warn "$buf\n";
                $last = $cur; $buf = "";
            }
            $buf .= "$k ";
        }
        warn "$buf\n";
        exit;
    },
    "long!"             => \$long,
    "q|quiet!"          => \$quiet,
    "utf8!"             => \$utf8,
    "v|verbose+"        => \$verbose,
    "version"           => sub {
        die "Version of $VERSION_DATE, by Steven J. DeRose.\n";
    }
    );
Getopt::Long::Configure ("ignore_case");
GetOptions(%getoptHash) || die("Bad options.\n");

# Test for the presence of an argument, not its truth: a literal "0"
# (code point 0) is a perfectly good request.
(@ARGV) ||
    die "Must have a non-negative numeric argument.\n";


###############################################################################
# Load up data about various character sets and encodings.
#
# These data hashes can be accessed via getCharInfo(hashRef, n, item).
my $c0Ref = getC0Data2();
my $c1Ref = getC1Data2();
my $cp1252Ref = getcp1252Data2();
my $macRef = getMacRomanData2();


###############################################################################
# MAIN
#
# Each command-line argument is classified and reported in turn:
#   * with --utf8: a hex UTF-8 byte string, or three 3-digit decimal bytes;
#   * all digits: a decimal code point (leading zeros do NOT mean octal);
#   * 0x... or 0b...: a hexadecimal or binary code point;
#   * a C0 mnemonic known to lookupAbbr(), such as "DC1";
#   * any other single character: its code point is shown as a hint.
#
# The loop tests defined() so that an argument of "0" or "" does not end
# processing early.
#
while (defined(my $arg = shift)) {
    if ($utf8) {                                      # UTF-8
        my $recoded = 0;
        if ($arg =~ m/^\W?(\d{3})\W?(\d{3})\W?(\d{3})$/) {
            my @octets = (int($1), int($2), int($3));
            # A byte cannot exceed 255; without this check two oversized
            # groups could still add up to an even number of hex digits
            # and be decoded as garbage.
            if (grep { $_ > 255 } @octets) {
                warn "Bad UTF8 value '$arg'. " .
                    "Each decimal group must be in the range 000-255.\n";
                next;
            }
            $arg = sprintf('0x%02x%02x%02x', @octets);
            $recoded = 1;
        }
        if ($arg !~ m/\A0?x([0-9a-f][0-9a-f])+\z/i) {
            warn "Bad UTF8 value '$arg'. Must be given as 0x....\n";
            if ($recoded) { warn "    Or as 3 groups of 3 decimal digits.\n"; }
            next;
        }
        (my $hex = $arg) =~ s/^0?x//i;
        # $hex is now an even-length run of hex digits; turn it into bytes.
        my $utfString = pack("H*", $hex);
        my $str = decode("utf8", $utfString);
        printf("utf-8 %s => %d Unicode character(s).\n", $arg, length($str));
        for (my $i=0; $i<length($str); $i++) {
            my $c = substr($str,$i,1);
            printf("  Character #%d (U+%04x):\n", ($i+1), ord($c));
            doOneChar(ord($c));
        }
    }
    elsif ($arg =~ m/^(\d+)$/) {                      # Decimal
        my $norm = $arg - 0;    # numify, which also drops any leading zeros
        print "$arg:\n";
        doOneChar($norm);
    }
    elsif ($arg =~ m/^(0x[\da-f]+|0b[01]+)$/i) {      # Hex or binary
        my $norm = oct($arg);   # oct() understands both 0x... and 0b...
        print "$arg:\n";
        doOneChar($norm);
    }
    elsif (defined(my $n = lookupAbbr($arg))) {       # Control
        # defined(), because NUL legitimately maps to code point 0.
        pline("Control char:",  $arg);  # . ' ' . getBases($n));
        doOneChar($n);
    }
    elsif (length($arg) == 1) {
        print "'$arg' is not numeric -- if you meant 'ord', not 'chr':\n";
        my $n = ord($arg);
        print "    " . getBases($n) . " '$arg'\n";
    }
    else {
        print "'$arg' is not numeric, and not a control char mnemonic.\n";
    }
}

exit;


###############################################################################
#
# Print the full report for one code point.
#   $n  The code point; a value beginning with "0" is first run through oct().
#   $u  Optional: the character itself. Defaults to chr($n).
# The "Bases:" and "XML forms:" lines are omitted when --quiet is in effect.
sub doOneChar {
    my ($n, $u) = @_;
    if ($n =~ m/^0/) { $n = oct $n; }
    $u = chr($n) unless (defined $u);

    showUnicodeInfo($n);
    showLiteral($n, $u);
    pline("Bases:", getBases($n)) unless ($quiet);

    # URI escaping
    pline("URI form:", getUTF8($n));

    # encode_entities() hands back a numeric reference when it has no
    # name for the character; report that case explicitly.
    my $entName = HTML::Entities::encode_entities($u);
    $entName = "-NO HTML NAMED ENTITY-" if ($entName =~ m/^&#/);

    my $xform;
    if (isXmlChar($u)) {
        $xform = sprintf("&#%d; &#x%x; %s\n", $n, $n, $entName);
    }
    else {
        $xform = "Not an XML character";
    }
    pline("XML forms:", $xform) unless ($quiet);
} # doOneChar

# Display the literal character if it's printable. Otherwise, describe
# what it is, such as a C0 or C1 control character.
#   $n  The code point.
#   $u  The character itself.
# Nothing is printed for code points of 256 and above.
#
# For 128-159 the cp1252 reading is also shown (unless --quiet is set
# without --cp1252). A cp1252 byte with no Unicode equivalent (recorded as
# -1 in the table), or with no table row at all, is reported as such
# rather than being formatted as a code point.
#
sub showLiteral {
    my ($n, $u) = @_;
    if ($n < 32) {
        ($verbose) && pline("C0 control character");
        if ($long) { pline("C0 control:",  getCharInfo($c0Ref, $n, 'Descr')); }
        else       { pline("C0 mnemonic:", getCharInfo($c0Ref, $n, 'Mnemonic')); }
    }
    elsif ($n == 32) {
        if ($long) { pline("SPACE:", getCharInfo($c0Ref, $n, 'Descr')); }
        else       { pline("SPACE:", getCharInfo($c0Ref, $n, 'Mnemonic')); }
    }
    elsif ($n < 128) {
        ($verbose) && pline("G0 graphic character");
        pline("G0 literal:", $u);
    }
    elsif ($n < 160) {
        ($verbose) && pline("C1 control character");
        if ($long) { pline("C1 control:",  getCharInfo($c1Ref, $n, 'Descr')); }
        else       { pline("C1 mnemonic:", getCharInfo($c1Ref, $n, 'Mnemonic')); }
        if ($cp1252 || !$quiet)  {
            print("*** DANGER: This is likely not Unicode but cp1252, in which case:\n");
            # Check for the row first so a gap in the table cannot abort
            # the whole report.
            my $UEq = (exists $cp1252Ref->{$n})
                ? getCharInfo($cp1252Ref, $n, 'UEquiv')
                : -1;
            if ($UEq eq "" || $UEq < 0) {
                pline("  Unicode equiv", "-NONE- (byte is unassigned in cp1252)");
            }
            else {
                pline("  Unicode equiv", sprintf("U+%04x (d%d)",$UEq, $UEq));
                my $uname = charnames::viacode($UEq);
                pline("  Unicode name:",  $uname || "-NONE-");
            }
        }
    }
    elsif ($n < 256) {
        ($verbose) && pline("G1 graphic character");
        pline("  G1 literal:", $u);
    }
}

# Print the Unicode name of code point $n and, unless --quiet is set, its
# script, block and plane. If isUnicodeCodePoint() rejects $n, only a
# warning is printed.
sub showUnicodeInfo {
    my ($n) = @_;

    # 0xEFBFBD is what you get by reading the three UTF-8 bytes of U+FFFD
    # as one number. It lies above 0x10FFFF, so this hint has to come
    # before the validity check below or it could never be printed.
    if ($n == 0xEFBFBD) {
        pline("WARNING:", "UTF8 of U+FFFD (Replacement Character)?");
    }

    if (!isUnicodeCodePoint($n)) {
        pline("WARNING:", "Not a Unicode graphical code point");
        return;
    }

    pline("Unicode Name:", charnames::viacode($n) || "-NOT FOUND-");
    if ($quiet) { return; }

    pline("Unicode Script: ", charscript(sprintf("U+%04x", $n)));
    pline("Unicode Block:  ",  charblock(sprintf("U+%04x", $n)));

    # The plane number is everything above the low 16 bits.
    my $pnum = $n >> 16;
    my $pname = "";
    if ($pnum>=0 && $pnum<=16) {
        $pname = $uPlanes[$pnum]->[0];
        if ($dnd) { $pname .= " (aka " . $uPlanes[$pnum]->[1] . ")"; }
    }
    else {
        $pname = "-UNKNOWN-";
    }
    pline("Unicode Plane:" , $pnum . ": " . $pname);
} # showUnicodeInfo


# Find a code point given its short name; mainly for control characters.
# For example, "DC1" => 17 (x11).
# Returns the code point as a number, or undef if $ab is not a mnemonic in
# the C0 table. Because NUL maps to 0, callers must test the result with
# defined() rather than for truth.
sub lookupAbbr {
    my ($ab) = @_;
    for my $n (keys %{$c0Ref}) {
        my $mnem = getCharInfo($c0Ref, $n, 'Mnemonic');
        if ($mnem eq $ab) {
            # The 'Hex' field already holds a number (the table is written
            # with 0x.. literals). Feeding its decimal string form back
            # through hex() would turn 17 into 0x17 == 23.
            return(getCharInfo($c0Ref, $n, 'Hex') + 0);
        }
    }
    return(undef);
}

# Test $c against a fixed set of characters (ASCII letters, digits and
# some punctuation) treated here as usable in a URI.
# Returns the result of the match (true/false), or 0 with a warning if
# evaluating the match raised an error.
# The parameter must stay named $c, because the evaluated string refers
# to it by that name.
sub isURIchar {
    my ($c) = @_;
    my $expr = "\$c =~ m/[-+A-Z_a-z0-9!\\\$&\'()*.\\\/:;=?\\\@]/";
    my $matched = eval($expr);
    return($matched) if ($@ eq "");

    warn "isURIchar: Bad Unicode character '$c'?\n" .
        "  Eval('$expr') said:\n  $@\n";
    return(0);
}

# Format code point $n in octal, decimal and hex (each padded to at least
# four digits), in U+ notation, and as \x-escaped UTF-8 bytes.
sub getBases {
    my ($n) = @_;
    my $utf8Escaped = getUTF8($n, "\\x");
    return(sprintf("o%04o d%04d x%04x; U+%04x, utf-8 %s",
                   ($n) x 4, $utf8Escaped));
}


###############################################################################
# Maintain arrays of names for the C0 and C1 control characters
# (See chart at bottom)
#
# Fetch one field of the tuple stored for code point $n in a table.
#   $hashRef  Reference to a table such as the one built by getC0Data2().
#   $n        Code point (hash key).
#   $item     'Literal', 'Hex', 'UEquiv', 'Mnemonic', 'Entname' or 'Descr'.
#             'Mnemonic' and 'Entname' deliberately name the same column.
# Returns "" when the table has no (or an empty) entry for $n.
# Dies if an entry exists but $item is not a known field name.
#
# The entry is checked before it is dereferenced: under 'use strict',
# @{ undef } in rvalue context is fatal, so a missing key would otherwise
# abort the program instead of yielding "".
sub getCharInfo {
    my ($hashRef, $n, $item) = @_;
    my $tupleRef = $hashRef->{$n};
    if (!$tupleRef || !@{$tupleRef}) { return(""); }

    my %columnOf = (
        'Literal'  => 0,
        'Hex'      => 1,
        'UEquiv'   => 2,
        'Mnemonic' => 3,  # !!
        'Entname'  => 3,  # !!
        'Descr'    => 4,
    );
    if (!exists $columnOf{$item}) {
        die "Bad data item name '$item'.\n";
    }
    return($tupleRef->[$columnOf{$item}]);
}


###############################################################################
# Make a hash by hexcode, with an array of data items (see comment below).
# There are not HTML named character entities for these characters, so that
# column provides the usual mnemonic for the character instead.
#
# Build and return a reference to the C0 table: a hash keyed by code point
# 0x00..0x20, each value being
#     [ Literal, Hex, UEquiv, Mnemonic, Descr ].
# For these characters Literal is chr(code), and Hex and UEquiv are both
# the code itself, so only the two names are listed and the rest is
# derived from the row's position.
sub getC0Data2 {
    my @names = (
        #  Mnemonic  Descr                             code
        [ "NUL",  "Null" ],                         # 0x00
        [ "SOH",  "Start Of Heading" ],             # 0x01
        [ "STX",  "Start Of Text" ],                # 0x02
        [ "ETX",  "End Of Text" ],                  # 0x03
        [ "EOT",  "End Of Transmission" ],          # 0x04
        [ "ENQ",  "Enquiry" ],                      # 0x05
        [ "ACK",  "Acknowledge" ],                  # 0x06
        [ "BEL",  "Bell" ],                         # 0x07

        [ "BS",   "Backspace" ],                    # 0x08
        [ "HT",   "Horizontal Tab" ],               # 0x09
        [ "LF",   "Newline" ],                      # 0x0A
        [ "VT",   "Vertical Tab" ],                 # 0x0B
        [ "FF",   "Form Feed" ],                    # 0x0C
        [ "CR",   "Carriage Return" ],              # 0x0D
        [ "SO",   "Shift Out" ],                    # 0x0E
        [ "SI",   "Shift In" ],                     # 0x0F

        [ "DLE",  "Data Link Escape" ],             # 0x10
        [ "DC1",  "Device Control 1" ],             # 0x11
        [ "DC2",  "Device Control 2" ],             # 0x12
        [ "DC3",  "Device Control 3" ],             # 0x13
        [ "DC4",  "Device Control 4" ],             # 0x14
        [ "NAK",  "Negative Acknowledge" ],         # 0x15
        [ "SYN",  "Synchronous Idle" ],             # 0x16
        [ "ETB",  "End Of Transmission Block" ],    # 0x17

        [ "CAN",  "Cancel" ],                       # 0x18
        [ "EM",   "End Of Medium" ],                # 0x19
        [ "SUB",  "Substitute" ],                   # 0x1A
        [ "ESC",  "Escape" ],                       # 0x1B
        [ "FS",   "Field Separator" ],              # 0x1C
        [ "GS",   "Group Separator" ],              # 0x1D
        [ "RS",   "Record Separator" ],             # 0x1E
        [ "US",   "Unit Separator" ],               # 0x1F

        [ "SP",   "Space" ],                        # 0x20
    );

    my %c0d;
    my $code = 0x00;
    for my $pair (@names) {
        $c0d{$code} = [ chr($code), $code, $code, @{$pair} ];
        $code++;
    }

    # Integrity check
    for my $i (0x00 .. 0x20) {
        my @tuple = @{$c0d{$i}};
        if (!@tuple) {
            die sprintf("getC0Data2: No entry for char %02x.\n", $i);
        }
        if ($i!=ord($tuple[0]) || $i!=$tuple[1] || $i!=$tuple[2]) {
            die sprintf("getC0Data2: data conflict for char %02x.\n", $i);
        }
    }
    return(\%c0d);
} # getC0Data2


###############################################################################
# PAD, HOP, and SGCI are unassigned in Unicode (acc. Wikipedia).
# The C1 area is the difference between Latin-1 and cp1252.
# There are not HTML named character entities for these characters, so that
# column provides the usual mnemonic for the character instead.
#
# Build and return a reference to the C1 table: a hash keyed by code point
# 0x80..0xA0, each value being
#     [ Literal, Hex, UEquiv, Mnemonic, Descr ].
# Literal is chr(code), and Hex and UEquiv are both the code itself, so
# only the names are listed and the rest is derived from the row position.
#
# Corrections relative to earlier versions of this table:
#   0x9E  mnemonic is PM (Privacy Message), not "*PVM";
#   0x9F  APC is Application Program Command, not "Padding Character".
sub getC1Data2 {
    my @names = (
        #  Mnemonic   Descr                                code
        [ "PAD?",   "Padding Character" ],              # 0x80
        [ "HOP?",   "High Octet Preset" ],              # 0x81
        [ "BPH",    "Break Permitted Here" ],           # 0x82
        [ "NBH",    "No Break Here" ],                  # 0x83
        [ "IND",    "Index" ],                          # 0x84
        [ "NEL",    "Next Line" ],                      # 0x85
        [ "SSA",    "Start of Selected Area" ],         # 0x86
        [ "ESA",    "End of Selected Area" ],           # 0x87
        [ "HTS",    "Horizontal Tab Set" ],             # 0x88
        [ "HTJ",    "Horizontal Tab Justified" ],       # 0x89
        [ "VTS",    "Vertical Tab Set" ],               # 0x8A
        [ "PLD",    "Partial Line Forward" ],           # 0x8B
        [ "PLU",    "Partial Line Backward" ],          # 0x8C
        [ "RI",     "Reverse Line Feed" ],              # 0x8D
        [ "SS2",    "Single-Shift 2" ],                 # 0x8E
        [ "SS3",    "Single-Shift 3" ],                 # 0x8F
        [ "DCS",    "Device Control String" ],          # 0x90
        [ "PU1",    "Private Use 1" ],                  # 0x91
        [ "PU2",    "Private Use 2" ],                  # 0x92
        [ "STS",    "Set Transmit State" ],             # 0x93
        [ "CCH",    "Cancel character" ],               # 0x94
        [ "MW",     "Message Waiting" ],                # 0x95
        [ "SPA",    "Start of Protected Area" ],        # 0x96
        [ "EPA",    "End of Protected Area" ],          # 0x97
        [ "SOS",    "Start of String" ],                # 0x98
        [ "SGCI?",  "Single Graphic Char Intro" ],      # 0x99
        [ "SCI",    "Single Char Intro" ],              # 0x9A
        [ "CSI",    "Control Sequence Introducer" ],    # 0x9B
        [ "ST",     "String Terminator" ],              # 0x9C
        [ "OSC",    "OS Command" ],                     # 0x9D
        [ "PM",     "Privacy Message" ],                # 0x9E
        [ "APC",    "Application Program Command" ],    # 0x9F
        [ "NBS",    "NO-BREAK SPACE" ],                 # 0xA0
    );

    my %c1d;
    my $code = 0x80;
    for my $pair (@names) {
        $c1d{$code} = [ chr($code), $code, $code, @{$pair} ];
        $code++;
    }

    # Integrity check. Existence is tested before dereferencing so that a
    # missing row produces the intended message rather than a strict-refs
    # error.
    for (my $i=0x80; $i<=0xA0; $i++) {
        if (!exists $c1d{$i} || !@{$c1d{$i}}) {
            die sprintf("getC1Data2: No entry for char %02x.\n", $i);
        }
        my @tuple = @{$c1d{$i}};
        if ($i!=ord($tuple[0]) || $i!=$tuple[1] || $i!=$tuple[2]) {
            die sprintf("getC1Data2: data conflict for char %02x.\n", $i);
        }
    }
    return(\%c1d);
} # getC1Data2


###############################################################################
# PAD, HOP, and SGCI are unassigned in Unicode (acc. Wikipedia).
# This area is the difference between Latin-1 and cp1252.
#
# Build and return a reference to the cp1252 table for bytes 0x80..0xA0:
# a hash keyed by byte value, each value being
#     [ Literal, Hex, UEquiv, EntName, Descr ].
# UEquiv is the Unicode code point the byte stands for in Windows code
# page 1252, or -1 for the five bytes cp1252 leaves unassigned
# (0x81, 0x8D, 0x8F, 0x90, 0x9D). Descr is empty throughout.
#
# Two rows are corrected relative to earlier versions of this table:
#   0x9E  was keyed as a second 0x80, which both overwrote the Euro row
#         and left 0x9E with no entry at all;
#   0x9F  is U+0178 (capital Y with diaeresis), not a second Euro sign.
sub getcp1252Data2 {
    my %cp1252d = (
    #       [ Literal    Hex   UEquiv  EntName      Descr
    0x80 => [ chr(0x80), 0x80, 0x20AC, "Euro",      '', ],
    0x81 => [ chr(0x81), 0x81, -1,     "",          '', ],
    0x82 => [ chr(0x82), 0x82, 0x201A, "LowSQuo",   '', ],
    0x83 => [ chr(0x83), 0x83, 0x0192, "Florin",    '', ],
    0x84 => [ chr(0x84), 0x84, 0x201E, "LowDQuo",   '', ],
    0x85 => [ chr(0x85), 0x85, 0x2026, "hellip",    '', ],
    0x86 => [ chr(0x86), 0x86, 0x2020, "dagger",    '', ],
    0x87 => [ chr(0x87), 0x87, 0x2021, "Dagger",    '', ],
    0x88 => [ chr(0x88), 0x88, 0x02C6, "Cflex",     '', ],
    0x89 => [ chr(0x89), 0x89, 0x2030, "PerMil",    '', ],
    0x8A => [ chr(0x8A), 0x8A, 0x0160, "SCaron",    '', ],
    0x8B => [ chr(0x8B), 0x8B, 0x2039, "LAQuo",     '', ],
    0x8C => [ chr(0x8C), 0x8C, 0x0152, "OElig",     '', ],
    0x8D => [ chr(0x8D), 0x8D, -1,     "",          '', ],
    0x8E => [ chr(0x8E), 0x8E, 0x017D, "ZCaron",    '', ],
    0x8F => [ chr(0x8F), 0x8F, -1,     "",          '', ],
    0x90 => [ chr(0x90), 0x90, -1,     "",          '', ],
    0x91 => [ chr(0x91), 0x91, 0x2018, "LSQuo",     '', ],
    0x92 => [ chr(0x92), 0x92, 0x2019, "RSQuo",     '', ],
    0x93 => [ chr(0x93), 0x93, 0x201C, "LDQuo",     '', ],
    0x94 => [ chr(0x94), 0x94, 0x201D, "RDQuo",     '', ],
    0x95 => [ chr(0x95), 0x95, 0x2022, "Bull",      '', ],
    0x96 => [ chr(0x96), 0x96, 0x2013, "enDash",    '', ],
    0x97 => [ chr(0x97), 0x97, 0x2014, "emDash",    '', ],
    0x98 => [ chr(0x98), 0x98, 0x02DC, "Tilde",     '', ],
    0x99 => [ chr(0x99), 0x99, 0x2122, "Trade",     '', ],
    0x9A => [ chr(0x9A), 0x9A, 0x0161, "sCaron",    '', ],
    0x9B => [ chr(0x9B), 0x9B, 0x203A, "RAQuo",     '', ],
    0x9C => [ chr(0x9C), 0x9C, 0x0153, "oelig",     '', ],
    0x9D => [ chr(0x9D), 0x9D, -1,     "",          '', ],
    0x9E => [ chr(0x9E), 0x9E, 0x017E, "zCaron",    '', ],
    0x9F => [ chr(0x9F), 0x9F, 0x0178, "Yuml",      '', ],
    0xA0 => [ chr(0xA0), 0xA0, 0x00A0, "nbsp",      '', ],
    );
    return(\%cp1252d);
} # getcp1252Data2


###############################################################################
# See http://en.wikipedia.org/wiki/Mac_OS_Roman
#
# Perl form for info on old MacRoman character set.
# See also other forms such as Python and XSV data.
# From 'chr' by Steven J. DeRose, as of 2015-09-15.
#
# Fields are:
#     MacRoman code point as literal
#     MacRoman hex code point
#     Corresponding Unicode hex code point
#     HTML entity name
#     Unicode character name (in mixed case)
#

# Build and return a reference to the MacRoman table: a hash keyed by
# MacRoman byte value 0x80..0xFF, each value being
#     [ Lit, Hex, UEquiv, EntName, Descr ]
# where Lit is chr(byte), Hex is the byte value, UEquiv is the matching
# Unicode code point, EntName is an HTML entity name ("" where none is
# given) and Descr is the character name in mixed case.
#
# Lit and Hex follow from the byte value, so each row below lists only
# UEquiv, EntName and Descr; rows are in byte order starting at 0x80.
sub getMacRomanData2 {
    my $uc = 'Latin capital letter ';
    my $lc = 'Latin small letter ';
    my $qm = ' quotation mark';
    my @rows = (
        # UEquiv  EntName   Descr                                    byte
        ####### C1 #######
        [ 0x00C4, "Auml",   $uc.'A with diaeresis' ],              # 0x80
        [ 0x00C5, "Aring",  $uc.'A with ring above' ],             # 0x81
        [ 0x00C7, "Ccedil", $uc.'C with cedilla' ],                # 0x82
        [ 0x00C9, "Eacute", $uc.'E with acute' ],                  # 0x83
        [ 0x00D1, "Ntilde", $uc.'N with tilde' ],                  # 0x84
        [ 0x00D6, "Ouml",   $uc.'O with diaeresis' ],              # 0x85
        [ 0x00DC, "Uuml",   $uc.'U with diaeresis' ],              # 0x86
        [ 0x00E1, "aacute", $lc.'a with acute' ],                  # 0x87
        [ 0x00E0, "agrave", $lc.'a with grave' ],                  # 0x88
        [ 0x00E2, "acirc",  $lc.'a with circumflex' ],             # 0x89
        [ 0x00E4, "auml",   $lc.'a with diaeresis' ],              # 0x8A
        [ 0x00E3, "atilde", $lc.'a with tilde' ],                  # 0x8B
        [ 0x00E5, "aring",  $lc.'a with ring above' ],             # 0x8C
        [ 0x00E7, "ccedil", $lc.'c with cedilla' ],                # 0x8D
        [ 0x00E9, "eacute", $lc.'e with acute' ],                  # 0x8E
        [ 0x00E8, "egrave", $lc.'e with grave' ],                  # 0x8F
        [ 0x00EA, "ecirc",  $lc.'e with circumflex' ],             # 0x90
        [ 0x00EB, "euml",   $lc.'e with diaeresis' ],              # 0x91
        [ 0x00ED, "iacute", $lc.'i with acute' ],                  # 0x92
        [ 0x00EC, "igrave", $lc.'i with grave' ],                  # 0x93
        [ 0x00EE, "icirc",  $lc.'i with circumflex' ],             # 0x94
        [ 0x00EF, "iuml",   $lc.'i with diaeresis' ],              # 0x95
        [ 0x00F1, "ntilde", $lc.'n with tilde' ],                  # 0x96
        [ 0x00F3, "oacute", $lc.'o with acute' ],                  # 0x97
        [ 0x00F2, "ograve", $lc.'o with grave' ],                  # 0x98
        [ 0x00F4, "ocirc",  $lc.'o with circumflex' ],             # 0x99
        [ 0x00F6, "ouml",   $lc.'o with diaeresis' ],              # 0x9A
        [ 0x00F5, "otilde", $lc.'o with tilde' ],                  # 0x9B
        [ 0x00FA, "uacute", $lc.'u with acute' ],                  # 0x9C
        [ 0x00F9, "ugrave", $lc.'u with grave' ],                  # 0x9D
        [ 0x00FB, "ucirc",  $lc.'u with circumflex' ],             # 0x9E
        [ 0x00FC, "uuml",   $lc.'u with diaeresis' ],              # 0x9F
        ####### G1 #######
        [ 0x2020, "dagger", 'Dagger' ],                            # 0xA0
        [ 0x00B0, "deg",    'Degree sign' ],                       # 0xA1
        [ 0x00A2, "cent",   'Cent sign' ],                         # 0xA2
        [ 0x00A3, "pound",  'Pound sign' ],                        # 0xA3
        [ 0x00A7, "sect",   'Section sign' ],                      # 0xA4
        [ 0x2022, "bull",   'Bullet' ],                            # 0xA5
        [ 0x00B6, "para",   'Pilcrow sign' ],                      # 0xA6
        [ 0x00DF, "szlig",  $lc.'sharp s' ],                       # 0xA7
        [ 0x00AE, "reg",    'Registered sign' ],                   # 0xA8
        [ 0x00A9, "copy",   'Copyright sign' ],                    # 0xA9
        [ 0x2122, "trade",  'Trade mark sign' ],                   # 0xAA
        [ 0x00B4, "acute",  'Acute accent' ],                      # 0xAB
        [ 0x00A8, "uml",    'Diaeresis' ],                         # 0xAC
        [ 0x2260, "ne",     'Not equal to' ],                      # 0xAD
        [ 0x00C6, "AElig",  $uc.'AE' ],                            # 0xAE
        [ 0x00D8, "Oslash", $uc.'O with stroke' ],                 # 0xAF
        [ 0x221E, "infin",  'Infinity' ],                          # 0xB0
        [ 0x00B1, "plusmn", 'Plus-minus sign' ],                   # 0xB1
        [ 0x2264, "le",     'Less-than or equal to' ],             # 0xB2
        [ 0x2265, "ge",     'Greater-than or equal to' ],          # 0xB3
        [ 0x00A5, "yen",    'Yen sign' ],                          # 0xB4
        [ 0x00B5, "micro",  'Micro sign' ],                        # 0xB5
        [ 0x2202, "part",   'Partial differential' ],              # 0xB6
        [ 0x2211, "sum",    'N-ary summation' ],                   # 0xB7
        [ 0x220F, "prod",   'N-ary product' ],                     # 0xB8
        [ 0x03C0, "pi",     'Greek small letter pi' ],             # 0xB9
        [ 0x222B, "int",    'Integral' ],                          # 0xBA
        [ 0x00AA, "ordf",   'Feminine ordinal indicator' ],        # 0xBB
        [ 0x00BA, "ordm",   'Masculine ordinal indicator' ],       # 0xBC
        [ 0x03A9, "Omega",  'Greek capital letter Omega' ],        # 0xBD
        [ 0x00E6, "aelig",  $lc.'ae' ],                            # 0xBE
        [ 0x00F8, "oslash", $lc.'o with stroke' ],                 # 0xBF
        [ 0x00BF, "iquest", 'Inverted question mark' ],            # 0xC0
        [ 0x00A1, "iexcl",  'Inverted exclamation mark' ],         # 0xC1
        [ 0x00AC, "not",    'Not sign' ],                          # 0xC2
        [ 0x221A, "radic",  'Square root' ],                       # 0xC3
        [ 0x0192, "fnof",   $lc.'f with hook' ],                   # 0xC4
        [ 0x2248, "asymp",  'Almost equal to' ],                   # 0xC5
        [ 0x2206, "",       'Increment' ],                         # 0xC6
        [ 0x00AB, "laquo",  'Left-pointing double angle'.$qm ],    # 0xC7
        [ 0x00BB, "raquo",  'Right-pointing double angle'.$qm ],   # 0xC8
        [ 0x2026, "hellip", 'Horizontal ellipsis' ],               # 0xC9
        [ 0x00A0, "nbsp",   'No-break space' ],                    # 0xCA
        [ 0x00C0, "Agrave", $uc.'A with grave' ],                  # 0xCB
        [ 0x00C3, "Atilde", $uc.'A with tilde' ],                  # 0xCC
        [ 0x00D5, "Otilde", $uc.'O with tilde' ],                  # 0xCD
        [ 0x0152, "OElig",  'Latin capital ligature OE' ],         # 0xCE
        [ 0x0153, "oelig",  'Latin small ligature oe' ],           # 0xCF
        [ 0x2013, "ndash",  'En dash' ],                           # 0xD0
        [ 0x2014, "mdash",  'Em dash' ],                           # 0xD1
        [ 0x201C, "ldquo",  'Left double'.$qm ],                   # 0xD2
        [ 0x201D, "rdquo",  'Right double'.$qm ],                  # 0xD3
        [ 0x2018, "lsquo",  'Left single'.$qm ],                   # 0xD4
        [ 0x2019, "rsquo",  'Right single'.$qm ],                  # 0xD5
        [ 0x00F7, "divide", 'Division sign' ],                     # 0xD6
        [ 0x25CA, "loz",    'Lozenge' ],                           # 0xD7
        [ 0x00FF, "yuml",   $lc.'y with diaeresis' ],              # 0xD8
        [ 0x0178, "Yuml",   $uc.'Y with diaeresis' ],              # 0xD9
        [ 0x2044, "frasl",  'Fraction slash' ],                    # 0xDA
        [ 0x20AC, "euro",   'Euro sign' ],                         # 0xDB
        [ 0x2039, "lsaquo", 'Single left-pointing angle'.$qm ],    # 0xDC
        [ 0x203A, "rsaquo", 'Single right-pointing angle'.$qm ],   # 0xDD
        [ 0xFB01, "",       'Latin small ligature fi' ],           # 0xDE
        [ 0xFB02, "",       'Latin small ligature fl' ],           # 0xDF
        [ 0x2021, "Dagger", 'Double dagger' ],                     # 0xE0
        [ 0x00B7, "middot", 'Middle dot' ],                        # 0xE1
        [ 0x201A, "sbquo",  'Single low-9'.$qm ],                  # 0xE2
        [ 0x201E, "bdquo",  'Double low-9'.$qm ],                  # 0xE3
        [ 0x2030, "permil", 'Per mille sign' ],                    # 0xE4
        [ 0x00C2, "Acirc",  $uc.'A with circumflex' ],             # 0xE5
        [ 0x00CA, "Ecirc",  $uc.'E with circumflex' ],             # 0xE6
        [ 0x00C1, "Aacute", $uc.'A with acute' ],                  # 0xE7
        [ 0x00CB, "Euml",   $uc.'E with diaeresis' ],              # 0xE8
        [ 0x00C8, "Egrave", $uc.'E with grave' ],                  # 0xE9
        [ 0x00CD, "Iacute", $uc.'I with acute' ],                  # 0xEA
        [ 0x00CE, "Icirc",  $uc.'I with circumflex' ],             # 0xEB
        [ 0x00CF, "Iuml",   $uc.'I with diaeresis' ],              # 0xEC
        [ 0x00CC, "Igrave", $uc.'I with grave' ],                  # 0xED
        [ 0x00D3, "Oacute", $uc.'O with acute' ],                  # 0xEE
        [ 0x00D4, "Ocirc",  $uc.'O with circumflex' ],             # 0xEF
        [ 0xF8FF, "",       'Apple logo' ],                        # 0xF0
        [ 0x00D2, "Ograve", $uc.'O with grave' ],                  # 0xF1
        [ 0x00DA, "Uacute", $uc.'U with acute' ],                  # 0xF2
        [ 0x00DB, "Ucirc",  $uc.'U with circumflex' ],             # 0xF3
        [ 0x00D9, "Ugrave", $uc.'U with grave' ],                  # 0xF4
        [ 0x0131, "",       $lc.'dotless i' ],                     # 0xF5
        [ 0x02C6, "circ",   'Modifier letter circumflex accent' ], # 0xF6
        [ 0x02DC, "tilde",  'Small tilde' ],                       # 0xF7
        [ 0x00AF, "macr",   'Macron' ],                            # 0xF8
        [ 0x02D8, "",       'Breve' ],                             # 0xF9
        [ 0x02D9, "",       'Dot above' ],                         # 0xFA
        [ 0x02DA, "",       'Ring above' ],                        # 0xFB
        [ 0x00B8, "cedil",  'Cedilla' ],                           # 0xFC
        [ 0x02DD, "",       'Double acute accent' ],               # 0xFD
        [ 0x02DB, "",       'Ogonek' ],                            # 0xFE
        [ 0x02C7, "",       'Caron' ],                             # 0xFF
    );

    my %mac;
    my $byte = 0x80;
    for my $row (@rows) {
        $mac{$byte} = [ chr($byte), $byte, @{$row} ];
        $byte++;
    }
    return(\%mac);
} # macRomanData2