-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathchr
executable file
·845 lines (744 loc) · 34.3 KB
/
chr
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
#!/usr/bin/env perl -w
#
# chr: Show char for a given code point number.
# 2007-10-29: Written by Steven J. DeRose.
#
use strict;
use Getopt::Long;
use Encode;
use charnames ':full';
use Unicode::UCD 'charscript';
use Unicode::UCD 'charblock';
use HTML::Entities;
#use Encode::Escape; #::Unicode;
use alogging;
# Descriptive metadata for this script, keyed by Dublin-Core-like field names.
our %metadata = (
    title        => 'chr',
    description  => 'Retrieve lots of data on Unicode characters.',
    rightsHolder => 'Steven J. DeRose',
    creator      => 'http://viaf.org/viaf/50334488',
    type         => 'http://purl.org/dc/dcmitype/Software',
    language     => 'Perl 5.18',
    created      => '2007-10-29',
    modified     => '2024-02-26',
    publisher    => 'http://github.com/sderose',
    license      => 'https://creativecommons.org/licenses/by-sa/3.0/',
);

# The last-modified date doubles as the version string reported by --version.
our $VERSION_DATE = $metadata{modified};
=pod
=head1 Usage
chr [options] [nums]
Display information about the character(s)
corresponding to the code point(s) number(s) in I<nums>.
For example, "chr 0x2203" produces:
0x2203:
Bases: o21003 d8707 x2203; U+2203, utf-8 \xe2\x88\x83
Unicode Name: THERE EXISTS
Unicode Script: Common
Unicode Block: Mathematical Operators
Unicode Plane: 0: Basic Multilingual
URI form: %e2%88%83
XML forms: &#8707; &#x2203; &exist;
I<nums> may be in hex (0x...), octal (0...), binary (0b...), or decimal.
With the I<--utf8> option, you can give I<nums> as hex UTF8.
Control characters and spaces will be displayed as mnemonics.
Tries to get the full Unicode char name for chars >255.
Display of other characters depends on your terminal program.
=head1 Options
(prefix 'no' to invert when applicable)
=over
=item * B<--cp1252>
Show Windows Code Page 1252 meanings of characters d128-d159 (the rest
of CP1252 matches Latin-1).
=item * B<--iencoding> I<e>
Assume the output is in character set I<e>.
Not yet supported.
See also I<--listEncodings>, I<--cp1252>.
=item * B<--listEncodings>
Show all the encodings supported by I<--iencoding>, and exit.
=item * B<--long>
Give long names for control characters, instead of mnemonics.
=item * B<--quiet> OR B<-q>
Suppress most messages.
=item * B<--utf8>
Interpret the command-line numbers as representations of UTF8 (the
most common representation for Unicode).
The input tokens can be in hexadecimal or decimal. For example,
the C<Left Double Quotation Mark> character is Unicode code point
U+201C (or o20034, d8220, x201c). In UTF8, all characters > 127 are encoded
as multiple bytes, in this case the 3 byte sequence \xe2, \x80, \x9c. To
use C<chr> to identify this sequence, do:
chr --utf8 0xe2809c
chr --utf8 226.128.156
With decimal, there must be exactly 9 digits, optionally separated into
3 groups of 3 using non-word-characters (to use space, quote the argument).
This limits the range of Unicode supported for decimal.
You can enter a UTF hex sequence that represents more than one character.
C<chr> will find the boundaries, and describe each character in turn.
However, C<chr> cannot do the same for decimal representations of more than
1 character together.
=item * B<--verbose> OR B<-v>
Add more detailed messages.
=item * B<--version>
Display version info and exit.
=back
=head1 Known bugs and limitations
Some terminal programs assume Latin-1, ASCII, or even CP1252,
while Perl most readily writes out Unicode (as utf8).
So output of literal characters outside the ASCII range,
going to a non-Unicode-supporting terminal, may look funny.
=head1 Related commands
C<CharDisplay> Python utility to just print a ton of data about a given
code point.
C<ord> -- Do the reverse (that is, find the numeric code point(s)
for given character(s)).
C<countChars> -- Find and/or count characters in particular ranges,
including XML character references, URI escapes, etc.
C<charnames> -- CPAN package to deal with Unicode properties and names.
C<showNumberInBases> -- takes numbers in any of several forms, and shows them
in multiple forms, much like this script also does with I<--nums>.
C<XmlTuples.pm> -- Parse various internal data about characters (optional).
=head1 History
2007-10-29: Written by Steven J. DeRose.
2008-01-02 sjd: Add -n. Fix name access for C1 range.
2008-09-03 sjd: Move to BSD.
2010-01-06 sjd: Add charnames for Unicode.
2010-05-03 sjd: perldoc. Start fixing base recognition.
2011-06-22 sjd: Fix bug handling unknown Unicode char names.
2011-06-29 sjd: Eliminate -multiple option. Fix decimal input.
2011-08-04 sjd: Support -cp1252. Fix oct() usage. Check Unicode max.
2012-01-27 sjd: Keep shifting data to use XmlTuples.
2012-02-28 sjd: Last of XmlTuples integration. Recognize C0 abbrs.
2012-07-27 sjd: Trap bad unicode char in isURIchar().
2012-08-13f sjd: Better message if arg isn't numeric. Do URI form.
Add HTML::Entities. Clean up display.
2013-01-14 sjd: Add Unicode script and block.
2013-06-19: Add Unicode equivalents for CP1252 chars.
2013-08-19: Add utf-8 input:
e28098 = u+2018 = lsquo
e28099 = u+2019 = rsquo
e2809c = u+201C = ldquo
e2809d = u+201D = rdquo
2015-07-07: Allow decimal "utf8" input: \226\128\153, etc.
2015-09-15: Make -q useful. Make table of plane names.
2024-02-26: Drop XSV data on MacRoman etc. New layout.
=head1 To do
Finish losing XmlTuples, c0longNames, etc.
Accept HTML named character entities.
Finish --iencoding, --mac.
Support input of UTF-8 byte sequences?
Option to display all Unicode properties?
Integrate with 'ord'?
Accept ^A...^Z, ^@.
=head1 Ownership
This work by Steven J. DeRose is licensed under a Creative Commons
Attribution-Share Alike 3.0 Unported License. For further information on
this license, see L<http://creativecommons.org/licenses/by-sa/3.0/>.
For the most recent version, see L<http://www.derose.net/steve/utilities/>.
=cut
# Names of the Unicode Planes, indexed by plane number (code point >> 16).
# Each entry is [ plane name, alias appended when the --dnd option is on ].
# Plane 3 has been the Tertiary Ideographic Plane since Unicode 13.0, so it
# is no longer listed as unassigned.
my @uPlanes = (
    [ "Basic Multilingual",               "Prime Material" ],   #  0
    [ "Supplementary Multilingual",       "Celestia"       ],   #  1
    [ "Supplementary Ideographic",        "Bytopia"        ],   #  2
    [ "Tertiary Ideographic",             "Elysium"        ],   #  3
    [ "Unassigned 4",                     "Beastlands"     ],   #  4
    [ "Unassigned 5",                     "Arborea"        ],   #  5
    [ "Unassigned 6",                     "Ysgard"         ],   #  6
    [ "Unassigned 7",                     "Limbo"          ],   #  7
    [ "Unassigned 8",                     "Pandemonium"    ],   #  8
    [ "Unassigned 9",                     "Abyss"          ],   #  9
    [ "Unassigned 10",                    "Carceri"        ],   # 10
    [ "Unassigned 11",                    "Hades"          ],   # 11
    [ "Unassigned 12",                    "Gehenna"        ],   # 12
    [ "Unassigned 13",                    "Baator"         ],   # 13
    [ "Supplementary Special-purpose",    "Acheron"        ],   # 14
    [ "Supplementary Private Use Area A", "Mechanus"       ],   # 15
    [ "Supplementary Private Use Area B", "Arcadia"        ],   # 16
);
###############################################################################
# These are from sjdUtils.pm.
#
# Plane 0 0x000000 to 0x00FFFF BMP
# Plane 1 0x010000 to 0x01FFFF Suppl MP
# Plane 2 0x020000 to 0x02FFFF Supple Ideographic
# Plane 3-13 0x030000 to 0x0DFFFF Unassigned
# Plane 14 0x0E0000 to 0x0EFFFF Suppl Special-purpose
# Plane 15-16 0x0F0000 to 0x10FFFF Suppl Private Use Area
#
# Return 1 if $n is a code point this script treats as a displayable Unicode
# character, else 0. Rejected: anything outside 0..0x10FFFF, the two
# non-characters U+FFFE and U+FFFF, and the C1 control range 0x80..0x9F.
sub isUnicodeCodePoint {
    my ($n) = @_;
    return(0) if ($n < 0x000000 || $n > 0x10FFFF);      # outside Unicode
    return(0) if ($n == 0x00FFFE || $n == 0x00FFFF);    # BMP non-characters
    return(0) if ($n >= 0x000080 && $n < 0x0000a0);     # C1 controls
    return(1);
}
# Character-class body for the XML "Char" production (XML REC, production 2):
#   #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
# Kept as escape text (single-quoted) so the regex compiler expands the
# escapes; this now includes the supplementary planes, which were missing.
my $xmlCharExpr =
    '\t\n\r\x20-\x{D7FF}\x{E000}-\x{FFFD}' .
    '\x{10000}-\x{10FFFF}';

# \A...\z rather than ^...$ so a trailing newline cannot sneak a
# two-character string ("a\n") past a single-character test.
my $xce = qr/\A[$xmlCharExpr]\z/;

# Return 1 if $c is exactly one character allowed in XML 1.0, else 0.
sub isXmlChar { # XML REC, production 2
    my ($c) = @_;
    return(($c =~ m/$xce/) ? 1:0);
}
# Try to load the Perl module named $mod at run time.
# Returns 1 on success. On failure returns 0, after warning unless $quiet
# is true. (This $quiet is the parameter, not the file-level option.)
sub Xtry_module {
    my ($mod, $quiet) = @_;
    eval("use $mod");
    return(1) unless ($@);
    if (!$quiet) {
        warn
            "try_module: Couldn't find Perl module '$mod'\n";
    }
    return(0);
}
# Print one "label value" output line, with the label left-justified in a
# 16-column field.
#   $label  text for the left column
#   $data   value for the right column; may be omitted (prints as empty)
# Uses defined-or rather than ||, so a legitimate value of "0" (for example
# the literal character for code point 48) is printed instead of blanked out.
sub pline {
    my ($label, $data) = @_;
    printf(" %-16s %s\n", $label, $data // "");
}
# Return the UTF-8 byte sequence for a given character code, as lowercase
# two-digit hex with each byte preceded by a separator.
#   $n    code point number
#   $sep  string placed before each byte; defaults to "%" (URI escaping).
#         getBases passes "\\x" to get \xNN notation instead.
# Example: getUTF8(0x2203) gives "%e2%88%83".
#
sub getUTF8 {
    my ($n, $sep) = @_;
    if (!defined $sep) { $sep = "%"; }
    # Named $bytes, not $utf8, so the file-level --utf8 flag is not shadowed.
    my $bytes = Encode::encode('utf8', chr($n));
    return(join("", map { sprintf("%s%02x", $sep, $_) } unpack("C*", $bytes)));
}
###############################################################################
# Options
#
# Option variables, with their defaults.
my $cp1252    = 0;      # --cp1252: show cp1252 meanings for d128-d159
my $dnd       = 1;      # --dnd: append the alias to Unicode plane names
my $iencoding = "";     # --iencoding: accepted but not yet used
my $long      = 0;      # --long: long names for control characters
my $quiet     = 0;      # --quiet: suppress most output
my $utf8      = 0;      # --utf8: arguments are UTF-8 byte sequences
my $verbose   = 0;      # --verbose: may be repeated

my %getoptHash = (
    "cp1252!"       => \$cp1252,
    "dnd!"          => \$dnd,
    # List-form system() so the script path is never parsed by a shell.
    "h|help|?"      => sub { system("perldoc", $0); exit; },
    "iencoding=s"   => \$iencoding,
    "listEncodings" => sub {
        warn "\nEncodings available:\n";
        # Group encoding names onto lines by their first two characters.
        my $last = ""; my $buf = "";
        for my $k (Encode->encodings(":all")) {
            my $cur = substr($k,0,2);
            if ($cur ne $last) {
                warn "$buf\n";
                $last = $cur; $buf = "";
            }
            $buf .= "$k ";
        }
        warn "$buf\n";
        exit;
    },
    "long!"         => \$long,
    "q|quiet!"      => \$quiet,
    "utf8!"         => \$utf8,
    "v|verbose+"    => \$verbose,
    "version"       => sub {
        die "Version of $VERSION_DATE, by Steven J. DeRose.\n";
    }
);
Getopt::Long::Configure ("ignore_case");
GetOptions(%getoptHash) || die("Bad options.\n");

# Test for the presence of an argument, not its truth: "0" is a valid
# code point and must not be rejected as missing.
(@ARGV) ||
    die "Must have a non-negative numeric argument.\n";
###############################################################################
# Load up data about various character sets and encodings.
#
# These data hashes can be accessed via getCharInfo(hashRef, n, item).
# Table for code points 0x00-0x20; read by showLiteral and lookupAbbr.
my $c0Ref = getC0Data2();
# Table of C1 control names for 0x80-0xA0; read by showLiteral.
my $c1Ref = getC1Data2();
# Table of cp1252 meanings for the same range; read by showLiteral.
my $cp1252Ref = getcp1252Data2();
# MacRoman table; built here but not referenced by any code visible in this file.
my $macRef = getMacRomanData2();
###############################################################################
# MAIN
#
# Describe each command-line argument in turn.
# The loop tests definedness, not truth, so an argument of "0" (code point 0)
# is processed rather than silently ending the loop.
while (defined(my $arg = shift)) {
    if ($utf8) {                                            # UTF-8
        my $recoded = 0;
        # Decimal form: three groups of three digits, e.g. 226.128.156.
        if ($arg =~ m/^\W?(\d{3})\W?(\d{3})\W?(\d{3})$/) {
            $arg = sprintf('0x%02x%02x%02x', int($1), int($2), int($3));
            $recoded = 1;
        }
        if ($arg !~ m/^0?x([0-9a-f][0-9a-f])+$/i) {
            warn "Bad UTF8 value '$arg'. Must be given as 0x....\n";
            if ($recoded) { warn " Or as 3 groups of 3 decimal digits.\n"; }
            next;
        }
        (my $hex = $arg) =~ s/^0?x//i;
        # Convert the hex digit pairs straight to bytes; no string eval needed.
        my $utfString = pack("H*", $hex);
        my $str = decode("utf8", $utfString);
        printf("utf-8 %s => %d Unicode character(s).\n", $arg, length($str));
        for (my $i=0; $i<length($str); $i++) {
            my $c = substr($str,$i,1);
            printf(" Character #%d (U+%04x):\n", ($i+1), ord($c));
            doOneChar(ord($c));
        }
    }
    elsif ($arg =~ m/^\d+$/) {                              # Decimal
        # NOTE(review): a leading-zero argument such as 017 is read as
        # decimal here, although the POD describes 0... as octal.
        my $norm = $arg - 0;
        print "$arg:\n";
        doOneChar($norm);
    }
    elsif ($arg =~ m/^(?:0x[\da-f]+|0b[01]+)$/i) {          # Hex or binary
        # oct() understands both the 0x and the 0b prefix.
        my $norm = oct($arg);
        print "$arg:\n";
        doOneChar($norm);
    }
    elsif (defined(my $n = lookupAbbr($arg))) {             # Control
        # defined(), because NUL legitimately maps to code point 0.
        pline("Control char:", $arg); # . ' ' . getBases($n));
        doOneChar($n);
    }
    elsif (length($arg) == 1) {
        print "'$arg' is not numeric -- if you meant 'ord', not 'chr':\n";
        my $n = ord($arg);
        print " " . getBases($n) . " '$arg'\n";
    }
    else {
        print "'$arg' is not numeric, and not a control char mnemonic.\n";
    }
}
exit;
###############################################################################
#
# Print the full report for one code point.
#   $n  code point number; a value with a leading 0 is passed through oct()
#   $u  the character itself; optional, defaults to chr($n)
# With --quiet, the Bases and XML-forms lines are left out.
sub doOneChar {
    my ($n, $u) = @_;
    if ($n =~ m/^0/) { $n = oct $n; }
    $u = chr($n) unless (defined $u);

    showUnicodeInfo($n);
    showLiteral($n, $u);
    pline("Bases:", getBases($n)) unless ($quiet);

    # URI escaping
    pline("URI form:", getUTF8($n));

    # An encode_entities result starting with "&#" is a numeric reference,
    # which means there is no named entity for this character.
    my $entName = HTML::Entities::encode_entities($u);
    $entName = "-NO HTML NAMED ENTITY-" if ($entName =~ m/^&#/);

    my $xform = "Not an XML character";
    if (isXmlChar($u)) {
        $xform = sprintf("&#%d; &#x%x; %s\n", $n, $n, $entName);
    }
    pline("XML forms:", $xform) unless ($quiet);
} # doOneChar
# Display the literal character if it's printable. Otherwise, describe
# what it is, such as a control character, non-legitimate Unicode, etc.
#   $n  code point number
#   $u  the character itself
# Only code points below 256 produce output. For the C1 range (128-159) the
# cp1252 interpretation is also shown, unless --quiet is on without --cp1252.
#
sub showLiteral {
    my ($n, $u) = @_;
    if ($n < 32) {
        ($verbose) && pline("C0 control character");
        if ($long) { pline("C0 control:", getCharInfo($c0Ref, $n, 'Descr')); }
        else { pline("C0 mnemonic:", getCharInfo($c0Ref, $n, 'Mnemonic')); }
    }
    elsif ($n == 32) {
        if ($long) { pline("SPACE:", getCharInfo($c0Ref, $n, 'Descr')); }
        else { pline("SPACE:", getCharInfo($c0Ref, $n, 'Mnemonic')); }
    }
    elsif ($n < 128) {
        ($verbose) && pline("G0 graphic character");
        pline("G0 literal:", $u);
    }
    elsif ($n < 160) {
        ($verbose) && pline("C1 control character");
        if ($long) { pline("C1 control:", getCharInfo($c1Ref, $n, 'Descr')); }
        else { pline("C1 mnemonic:", getCharInfo($c1Ref, $n, 'Mnemonic')); }
        if ($cp1252 || !$quiet) {
            print("*** DANGER: This is likely not Unicode but cp1252, in which case:\n");
            # A row may be absent, or carry -1 for a byte cp1252 leaves
            # unassigned. Report that rather than formatting -1 as a code point.
            my $UEq = (exists $cp1252Ref->{$n})
                ? getCharInfo($cp1252Ref, $n, 'UEquiv') : -1;
            if ($UEq < 0) {
                pline(" Unicode equiv", "-NONE- (unassigned in cp1252)");
            }
            else {
                pline(" Unicode equiv", sprintf("U+%04x (d%d)",$UEq, $UEq));
                my $uname = charnames::viacode($UEq);
                pline(" Unicode name:", $uname || "-NONE-");
            }
        }
    }
    elsif ($n < 256) {
        ($verbose) && pline("G1 graphic character");
        pline(" G1 literal:", $u);
    }
}
# Print the Unicode name and, unless --quiet, the script, block and plane
# of code point $n. If isUnicodeCodePoint() rejects $n, only warnings are
# printed.
sub showUnicodeInfo {
    my ($n) = @_;
    if (!isUnicodeCodePoint($n)) {
        pline("WARNING:", "Not a Unicode graphical code point");
        # 0xEFBFBD is above 0x10FFFF, so this hint can only fire here; placed
        # after the validity check it was unreachable.
        if ($n == 0xEFBFBD) {
            pline("WARNING:", "UTF8 of U+FFFD (Replacement Character)?");
        }
        return;
    }
    pline("Unicode Name:", charnames::viacode($n) || "-NOT FOUND-");
    if ($quiet) { return; }
    pline("Unicode Script: ", charscript(sprintf("U+%04x", $n)));
    pline("Unicode Block: ", charblock(sprintf("U+%04x", $n)));

    # The plane number is everything above the low 16 bits.
    my $pnum = $n >> 16;
    my $pname = "";
    if ($pnum>=0 && $pnum<=16) {
        $pname = $uPlanes[$pnum]->[0];
        if ($dnd) { $pname .= " (aka " . $uPlanes[$pnum]->[1] . ")"; }
    }
    else {
        $pname = "-UNKNOWN-";
    }
    pline("Unicode Plane:" , $pnum . ": " . $pname);
} # showUnicodeInfo
# Find a code point given its short name; mainly for control characters.
# For example, "DC1" => 0x11 (decimal 17).
#   $ab  mnemonic to look up in the C0 table (exact, case-sensitive match)
# Returns the code point as a number, or undef if no entry matches. Callers
# must test with defined(), because NUL maps to 0.
sub lookupAbbr {
    my ($ab) = @_;
    for my $n (keys %{$c0Ref}) {
        my $mnem = getCharInfo($c0Ref, $n, 'Mnemonic');
        if ($mnem eq $ab) {
            # The 'Hex' column is already a number. Re-parsing its decimal
            # string form with hex() turned 0x11 (17) into 0x17 (23).
            return(getCharInfo($c0Ref, $n, 'Hex') + 0);
        }
    }
    return(undef);
}
# Test whether $c contains a character from this script's set of characters
# that may appear literally in a URI.
# Returns the match result (true, or the empty string); returns 0 after a
# warning if the match itself raises an error.
sub isURIchar {
    my ($c) = @_;
    # Block eval still traps a fatal error from a bad character, without
    # recompiling the statement from a string on every call.
    my $rc = eval { $c =~ m/[-+A-Z_a-z0-9!\$&'()*.\/:;=?\@]/ };
    if ($@ ne "") {
        warn "isURIchar: Bad Unicode character '$c'?\n" .
            " Match against URI character class said:\n $@\n";
        return(0);
    }
    return($rc);
}
# Return one string showing code point $n in octal, decimal and hex, as
# U+nnnn, and as UTF-8 bytes in \xNN notation.
sub getBases {
    my ($n) = @_;
    my $utf8Bytes = getUTF8($n, "\\x");
    my $layout = "o%04o d%04d x%04x; U+%04x, utf-8 %s";
    return(sprintf($layout, ($n) x 4, $utf8Bytes));
}
###############################################################################
# Maintain arrays of names for the C0 and C1 control characters
# (See chart at bottom)
#
# Fetch one field of one row from a character table.
#   $hashRef  table mapping code point => [ Literal, Hex, UEquiv, Name, Descr ]
#   $n        code point to look up
#   $item     'Literal', 'Hex', 'UEquiv', 'Mnemonic', 'Entname' or 'Descr'
# 'Mnemonic' and 'Entname' deliberately read the same column: tables without
# HTML entity names store the mnemonic there.
# Returns "" if the table has no row for $n; dies on an unknown $item.
sub getCharInfo {
    my ($hashRef, $n, $item) = @_;
    # Check for the row before dereferencing: under 'use strict', @{undef}
    # is fatal, so the original never reached its empty-row return.
    my $tuple = $hashRef->{$n};
    if (!$tuple || !@{$tuple}) { return(""); }
    if ($item eq 'Literal') { return($tuple->[0]); }
    elsif ($item eq 'Hex') { return($tuple->[1]); }
    elsif ($item eq 'UEquiv') { return($tuple->[2]); }
    elsif ($item eq 'Mnemonic') { return($tuple->[3]); } # !!
    elsif ($item eq 'Entname') { return($tuple->[3]); } # !!
    elsif ($item eq 'Descr') { return($tuple->[4]); }
    else {
        die "Bad data item name '$item'.\n";
    }
}
###############################################################################
# Make a hash by hexcode, with an array of data items (see comment below).
# There are not HTML named character entities for these characters, so that
# column provides the usual mnemonic for the character instead.
#
# Build and return a reference to the table for code points 0x00-0x20
# (the C0 controls plus space). Each row is
#   [ Literal, Hex, UEquiv, Mnemonic, Descr ]
# Dies if a row is missing or if the key, literal, Hex and UEquiv disagree.
sub getC0Data2 {
    my %c0d = (
        #       [ Literal    Hex   UEquiv Mnemonic Descr ]
        0x00 => [ chr(0x00), 0x00, 0x00, "NUL", "Null" ],
        0x01 => [ chr(0x01), 0x01, 0x01, "SOH", "Start Of Heading" ],
        0x02 => [ chr(0x02), 0x02, 0x02, "STX", "Start Of Text" ],
        0x03 => [ chr(0x03), 0x03, 0x03, "ETX", "End Of Text" ],
        0x04 => [ chr(0x04), 0x04, 0x04, "EOT", "End Of Transmission" ],
        0x05 => [ chr(0x05), 0x05, 0x05, "ENQ", "Enquiry" ],
        0x06 => [ chr(0x06), 0x06, 0x06, "ACK", "Acknowledge" ],
        0x07 => [ chr(0x07), 0x07, 0x07, "BEL", "Bell" ],
        0x08 => [ chr(0x08), 0x08, 0x08, "BS", "Backspace" ],
        0x09 => [ chr(0x09), 0x09, 0x09, "HT", "Horizontal Tab" ],
        0x0A => [ chr(0x0A), 0x0A, 0x0A, "LF", "Newline" ],
        0x0B => [ chr(0x0B), 0x0B, 0x0B, "VT", "Vertical Tab" ],
        0x0C => [ chr(0x0C), 0x0C, 0x0C, "FF", "Form Feed" ],
        0x0D => [ chr(0x0D), 0x0D, 0x0D, "CR", "Carriage Return" ],
        0x0E => [ chr(0x0E), 0x0E, 0x0E, "SO", "Shift Out" ],
        0x0F => [ chr(0x0F), 0x0F, 0x0F, "SI", "Shift In" ],
        0x10 => [ chr(0x10), 0x10, 0x10, "DLE", "Data Link Escape" ],
        0x11 => [ chr(0x11), 0x11, 0x11, "DC1", "Device Control 1" ],
        0x12 => [ chr(0x12), 0x12, 0x12, "DC2", "Device Control 2" ],
        0x13 => [ chr(0x13), 0x13, 0x13, "DC3", "Device Control 3" ],
        0x14 => [ chr(0x14), 0x14, 0x14, "DC4", "Device Control 4" ],
        0x15 => [ chr(0x15), 0x15, 0x15, "NAK", "Negative Acknowledge" ],
        0x16 => [ chr(0x16), 0x16, 0x16, "SYN", "Synchronous Idle" ],
        0x17 => [ chr(0x17), 0x17, 0x17, "ETB", "End Of Transmission Block" ],
        0x18 => [ chr(0x18), 0x18, 0x18, "CAN", "Cancel" ],
        0x19 => [ chr(0x19), 0x19, 0x19, "EM", "End Of Medium" ],
        0x1A => [ chr(0x1A), 0x1A, 0x1A, "SUB", "Substitute" ],
        0x1B => [ chr(0x1B), 0x1B, 0x1B, "ESC", "Escape" ],
        # FS is "File Separator" in ASCII (was mislabelled "Field Separator").
        0x1C => [ chr(0x1C), 0x1C, 0x1C, "FS", "File Separator" ],
        0x1D => [ chr(0x1D), 0x1D, 0x1D, "GS", "Group Separator" ],
        0x1E => [ chr(0x1E), 0x1E, 0x1E, "RS", "Record Separator" ],
        0x1F => [ chr(0x1F), 0x1F, 0x1F, "US", "Unit Separator" ],
        0x20 => [ chr(0x20), 0x20, 0x20, "SP", "Space" ],
    );
    for my $i (0x00 .. 0x20) { # Integrity check
        # Test for the row before dereferencing, so a missing row produces
        # the intended message instead of a strict-refs error.
        my $tuple = $c0d{$i};
        if (!$tuple || !@{$tuple}) {
            die sprintf("getC0Data2: No entry for char %02x.\n", $i);
        }
        if ($i!=ord($tuple->[0]) || $i!=$tuple->[1] || $i!=$tuple->[2]) {
            die sprintf("getC0Data2: data conflict for char %02x.\n", $i);
        }
    }
    return(\%c0d);
} # getC0Data2
###############################################################################
# PAD, HOP, and SGCI are unassigned in Unicode (acc. Wikipedia),
# The C1 area is the difference between Latin-1 and cp1252.
# There are not HTML named character entities for these characters, so that
# column provides the usual mnemonic for the character instead.
#
# Build and return a reference to the table of C1 control characters
# (0x80-0x9F) plus NO-BREAK SPACE (0xA0). Each row is
#   [ Literal, Hex, UEquiv, Mnemonic, Descr ]
# A trailing "?" on a mnemonic marks the names the comment above this sub
# lists as unassigned in Unicode.
# Dies if a row is missing or if the key, literal, Hex and UEquiv disagree.
sub getC1Data2 {
    my %c1d = (
        #       [ Literal    Hex   UEquiv Mnemonic Descr
        0x80 => [ chr(0x80), 0x80, 0x80, "PAD?", "Padding Character", ],
        0x81 => [ chr(0x81), 0x81, 0x81, "HOP?", "High Octet Preset", ],
        0x82 => [ chr(0x82), 0x82, 0x82, "BPH", "Break Permitted Here", ],
        0x83 => [ chr(0x83), 0x83, 0x83, "NBH", "No Break Here", ],
        0x84 => [ chr(0x84), 0x84, 0x84, "IND", "Index", ],
        0x85 => [ chr(0x85), 0x85, 0x85, "NEL", "Next Line", ],
        0x86 => [ chr(0x86), 0x86, 0x86, "SSA", "Start of Selected Area", ],
        0x87 => [ chr(0x87), 0x87, 0x87, "ESA", "End of Selected Area", ],
        0x88 => [ chr(0x88), 0x88, 0x88, "HTS", "Horizontal Tab Set", ],
        0x89 => [ chr(0x89), 0x89, 0x89, "HTJ", "Horizontal Tab Justified", ],
        0x8A => [ chr(0x8A), 0x8A, 0x8A, "VTS", "Vertical Tab Set", ],
        0x8B => [ chr(0x8B), 0x8B, 0x8B, "PLD", "Partial Line Forward", ],
        0x8C => [ chr(0x8C), 0x8C, 0x8C, "PLU", "Partial Line Backward", ],
        0x8D => [ chr(0x8D), 0x8D, 0x8D, "RI", "Reverse Line Feed", ],
        0x8E => [ chr(0x8E), 0x8E, 0x8E, "SS2", "Single-Shift 2", ],
        0x8F => [ chr(0x8F), 0x8F, 0x8F, "SS3", "Single-Shift 3", ],
        0x90 => [ chr(0x90), 0x90, 0x90, "DCS", "Device Control String", ],
        0x91 => [ chr(0x91), 0x91, 0x91, "PU1", "Private Use 1", ],
        0x92 => [ chr(0x92), 0x92, 0x92, "PU2", "Private Use 2", ],
        0x93 => [ chr(0x93), 0x93, 0x93, "STS", "Set Transmit State", ],
        0x94 => [ chr(0x94), 0x94, 0x94, "CCH", "Cancel character", ],
        0x95 => [ chr(0x95), 0x95, 0x95, "MW", "Message Waiting", ],
        0x96 => [ chr(0x96), 0x96, 0x96, "SPA", "Start of Protected Area", ],
        0x97 => [ chr(0x97), 0x97, 0x97, "EPA", "End of Protected Area", ],
        0x98 => [ chr(0x98), 0x98, 0x98, "SOS", "Start of String", ],
        0x99 => [ chr(0x99), 0x99, 0x99, "SGCI?", "Single Graphic Char Intro", ],
        0x9A => [ chr(0x9A), 0x9A, 0x9A, "SCI", "Single Char Intro", ],
        0x9B => [ chr(0x9B), 0x9B, 0x9B, "CSI", "Control Sequence Introducer", ],
        0x9C => [ chr(0x9C), 0x9C, 0x9C, "ST", "String Terminator", ],
        0x9D => [ chr(0x9D), 0x9D, 0x9D, "OSC", "OS Command", ],
        # The standard mnemonic for Privacy Message is PM (was "*PVM").
        0x9E => [ chr(0x9E), 0x9E, 0x9E, "PM", "Privacy Message", ],
        # APC is Application Program Command (description was a copy of PAD's).
        0x9F => [ chr(0x9F), 0x9F, 0x9F, "APC", "Application Program Command", ],
        0xA0 => [ chr(0xA0), 0xA0, 0xA0, "NBS", "NO-BREAK SPACE", ],
    );
    for my $i (0x80 .. 0xA0) { # Integrity check
        # Test for the row before dereferencing, so a missing row produces
        # the intended message instead of a strict-refs error.
        my $tuple = $c1d{$i};
        if (!$tuple || !@{$tuple}) {
            die sprintf("getC1Data2: No entry for char %02x.\n", $i);
        }
        if ($i!=ord($tuple->[0]) || $i!=$tuple->[1] || $i!=$tuple->[2]) {
            die sprintf("getC1Data2: data conflict for char %02x.\n", $i);
        }
    }
    return(\%c1d);
} # getC1Data2
###############################################################################
# PAD, HOP, and SGCI are unassigned in Unicode (acc. Wikipedia).
# This area is the difference between Latin-1 and cp1252.
#
# Build and return a reference to the table giving the Windows cp1252
# meaning of bytes 0x80-0xA0. Each row is
#   [ Literal, Hex, UEquiv, EntName, Descr ]
# UEquiv is the corresponding Unicode code point, or -1 for the five bytes
# that cp1252 leaves unassigned (0x81, 0x8D, 0x8F, 0x90, 0x9D).
sub getcp1252Data2 {
    my %cp1252d = (
        #       [ Literal    Hex   UEquiv  EntName    Descr
        0x80 => [ chr(0x80), 0x80, 0x20AC, "Euro", '', ],
        0x81 => [ chr(0x81), 0x81, -1, "", '', ],
        0x82 => [ chr(0x82), 0x82, 0x201A, "LowSQuo", '', ],
        0x83 => [ chr(0x83), 0x83, 0x0192, "Florin", '', ],
        0x84 => [ chr(0x84), 0x84, 0x201E, "LowDQuo", '', ],
        0x85 => [ chr(0x85), 0x85, 0x2026, "hellip", '', ],
        0x86 => [ chr(0x86), 0x86, 0x2020, "dagger", '', ],
        0x87 => [ chr(0x87), 0x87, 0x2021, "Dagger", '', ],
        0x88 => [ chr(0x88), 0x88, 0x02C6, "Cflex", '', ],
        0x89 => [ chr(0x89), 0x89, 0x2030, "PerMil", '', ],
        0x8A => [ chr(0x8A), 0x8A, 0x0160, "SCaron", '', ],
        0x8B => [ chr(0x8B), 0x8B, 0x2039, "LAQuo", '', ],
        0x8C => [ chr(0x8C), 0x8C, 0x0152, "OElig", '', ],
        0x8D => [ chr(0x8D), 0x8D, -1, "", '', ],
        0x8E => [ chr(0x8E), 0x8E, 0x017D, "ZCaron", '', ],
        0x8F => [ chr(0x8F), 0x8F, -1, "", '', ],
        0x90 => [ chr(0x90), 0x90, -1, "", '', ],
        0x91 => [ chr(0x91), 0x91, 0x2018, "LSQuo", '', ],
        0x92 => [ chr(0x92), 0x92, 0x2019, "RSQuo", '', ],
        0x93 => [ chr(0x93), 0x93, 0x201C, "LDQuo", '', ],
        0x94 => [ chr(0x94), 0x94, 0x201D, "RDQuo", '', ],
        0x95 => [ chr(0x95), 0x95, 0x2022, "Bull", '', ],
        0x96 => [ chr(0x96), 0x96, 0x2013, "enDash", '', ],
        0x97 => [ chr(0x97), 0x97, 0x2014, "emDash", '', ],
        0x98 => [ chr(0x98), 0x98, 0x02DC, "Tilde", '', ],
        0x99 => [ chr(0x99), 0x99, 0x2122, "Trade", '', ],
        0x9A => [ chr(0x9A), 0x9A, 0x0161, "sCaron", '', ],
        0x9B => [ chr(0x9B), 0x9B, 0x203A, "RAQuo", '', ],
        0x9C => [ chr(0x9C), 0x9C, 0x0153, "oelig", '', ],
        0x9D => [ chr(0x9D), 0x9D, -1, "", '', ],
        # This row was keyed 0x80 by mistake, which replaced the Euro row
        # with z-caron and left 0x9E with no entry at all.
        0x9E => [ chr(0x9E), 0x9E, 0x017E, "zCaron", '', ],
        # cp1252 0x9F is capital Y with diaeresis (U+0178), not a second Euro.
        0x9F => [ chr(0x9F), 0x9F, 0x0178, "Yuml", '', ],
        0xA0 => [ chr(0xA0), 0xA0, 0x00A0, "nbsp", '', ],
    );
    return(\%cp1252d);
} # getcp1252Data2
###############################################################################
# See http://en.wikipedia.org/wiki/Mac_OS_Roman
#
# Perl form for info on old MacRoman character set.
# See also other forms such as Python and XSV data.
# From 'chr' by Steven J. DeRose, as of 2015-09-15.
#
# Fields are:
# MacRoman code point as literal
# MacRoman hex code point
# Corresponding Unicode hex code point
# HTML entity name
# Unicode character name (in mixed case)
#
# Build and return a reference to a hash describing the upper half
# (0x80-0xFF) of the MacRoman character set. Each value is an array ref:
#   [ literal, MacRoman code, Unicode code point, HTML entity name, description ]
# The entity name is "" for rows where the table gives none.
sub getMacRomanData2 {
# Shared prefixes/suffix for the description column, to keep rows short.
my $uc = 'Latin capital letter ';
my $lc = 'Latin small letter ';
my $qm = ' quotation mark';
my %mac = (
# [ Lit Hex UEquiv EntName Descr
####### C1 #######
0x80 => [ chr(0x80), 0x80, 0x00C4, "Auml", $uc.'A with diaeresis', ],
0x81 => [ chr(0x81), 0x81, 0x00C5, "Aring", $uc.'A with ring above', ],
0x82 => [ chr(0x82), 0x82, 0x00C7, "Ccedil", $uc.'C with cedilla', ],
0x83 => [ chr(0x83), 0x83, 0x00C9, "Eacute", $uc.'E with acute', ],
0x84 => [ chr(0x84), 0x84, 0x00D1, "Ntilde", $uc.'N with tilde', ],
0x85 => [ chr(0x85), 0x85, 0x00D6, "Ouml", $uc.'O with diaeresis', ],
0x86 => [ chr(0x86), 0x86, 0x00DC, "Uuml", $uc.'U with diaeresis', ],
0x87 => [ chr(0x87), 0x87, 0x00E1, "aacute", $lc.'a with acute', ],
0x88 => [ chr(0x88), 0x88, 0x00E0, "agrave", $lc.'a with grave', ],
0x89 => [ chr(0x89), 0x89, 0x00E2, "acirc", $lc.'a with circumflex', ],
0x8A => [ chr(0x8A), 0x8A, 0x00E4, "auml", $lc.'a with diaeresis', ],
0x8B => [ chr(0x8B), 0x8B, 0x00E3, "atilde", $lc.'a with tilde', ],
0x8C => [ chr(0x8C), 0x8C, 0x00E5, "aring", $lc.'a with ring above', ],
0x8D => [ chr(0x8D), 0x8D, 0x00E7, "ccedil", $lc.'c with cedilla', ],
0x8E => [ chr(0x8E), 0x8E, 0x00E9, "eacute", $lc.'e with acute', ],
0x8F => [ chr(0x8F), 0x8F, 0x00E8, "egrave", $lc.'e with grave', ],
0x90 => [ chr(0x90), 0x90, 0x00EA, "ecirc", $lc.'e with circumflex', ],
0x91 => [ chr(0x91), 0x91, 0x00EB, "euml", $lc.'e with diaeresis', ],
0x92 => [ chr(0x92), 0x92, 0x00ED, "iacute", $lc.'i with acute', ],
0x93 => [ chr(0x93), 0x93, 0x00EC, "igrave", $lc.'i with grave', ],
0x94 => [ chr(0x94), 0x94, 0x00EE, "icirc", $lc.'i with circumflex', ],
0x95 => [ chr(0x95), 0x95, 0x00EF, "iuml", $lc.'i with diaeresis', ],
0x96 => [ chr(0x96), 0x96, 0x00F1, "ntilde", $lc.'n with tilde', ],
0x97 => [ chr(0x97), 0x97, 0x00F3, "oacute", $lc.'o with acute', ],
0x98 => [ chr(0x98), 0x98, 0x00F2, "ograve", $lc.'o with grave', ],
0x99 => [ chr(0x99), 0x99, 0x00F4, "ocirc", $lc.'o with circumflex', ],
0x9A => [ chr(0x9A), 0x9A, 0x00F6, "ouml", $lc.'o with diaeresis', ],
0x9B => [ chr(0x9B), 0x9B, 0x00F5, "otilde", $lc.'o with tilde', ],
0x9C => [ chr(0x9C), 0x9C, 0x00FA, "uacute", $lc.'u with acute', ],
0x9D => [ chr(0x9D), 0x9D, 0x00F9, "ugrave", $lc.'u with grave', ],
0x9E => [ chr(0x9E), 0x9E, 0x00FB, "ucirc", $lc.'u with circumflex', ],
0x9F => [ chr(0x9F), 0x9F, 0x00FC, "uuml", $lc.'u with diaeresis', ],
####### G1 #######
0xA0 => [ chr(0xA0), 0xA0, 0x2020, "dagger", 'Dagger', ],
0xA1 => [ chr(0xA1), 0xA1, 0x00B0, "deg", 'Degree sign', ],
0xA2 => [ chr(0xA2), 0xA2, 0x00A2, "cent", 'Cent sign', ],
0xA3 => [ chr(0xA3), 0xA3, 0x00A3, "pound", 'Pound sign', ],
0xA4 => [ chr(0xA4), 0xA4, 0x00A7, "sect", 'Section sign', ],
0xA5 => [ chr(0xA5), 0xA5, 0x2022, "bull", 'Bullet', ],
0xA6 => [ chr(0xA6), 0xA6, 0x00B6, "para", 'Pilcrow sign', ],
0xA7 => [ chr(0xA7), 0xA7, 0x00DF, "szlig", $lc.'sharp s', ],
0xA8 => [ chr(0xA8), 0xA8, 0x00AE, "reg", 'Registered sign', ],
0xA9 => [ chr(0xA9), 0xA9, 0x00A9, "copy", 'Copyright sign', ],
0xAA => [ chr(0xAA), 0xAA, 0x2122, "trade", 'Trade mark sign', ],
0xAB => [ chr(0xAB), 0xAB, 0x00B4, "acute", 'Acute accent', ],
0xAC => [ chr(0xAC), 0xAC, 0x00A8, "uml", 'Diaeresis', ],
0xAD => [ chr(0xAD), 0xAD, 0x2260, "ne", 'Not equal to', ],
0xAE => [ chr(0xAE), 0xAE, 0x00C6, "AElig", $uc.'AE', ],
0xAF => [ chr(0xAF), 0xAF, 0x00D8, "Oslash", $uc.'O with stroke', ],
0xB0 => [ chr(0xB0), 0xB0, 0x221E, "infin", 'Infinity', ],
0xB1 => [ chr(0xB1), 0xB1, 0x00B1, "plusmn", 'Plus-minus sign', ],
0xB2 => [ chr(0xB2), 0xB2, 0x2264, "le", 'Less-than or equal to', ],
0xB3 => [ chr(0xB3), 0xB3, 0x2265, "ge", 'Greater-than or equal to', ],
0xB4 => [ chr(0xB4), 0xB4, 0x00A5, "yen", 'Yen sign', ],
0xB5 => [ chr(0xB5), 0xB5, 0x00B5, "micro", 'Micro sign', ],
0xB6 => [ chr(0xB6), 0xB6, 0x2202, "part", 'Partial differential', ],
0xB7 => [ chr(0xB7), 0xB7, 0x2211, "sum", 'N-ary summation', ],
0xB8 => [ chr(0xB8), 0xB8, 0x220F, "prod", 'N-ary product', ],
0xB9 => [ chr(0xB9), 0xB9, 0x03C0, "pi", 'Greek small letter pi', ],
0xBA => [ chr(0xBA), 0xBA, 0x222B, "int", 'Integral', ],
0xBB => [ chr(0xBB), 0xBB, 0x00AA, "ordf", 'Feminine ordinal indicator', ],
0xBC => [ chr(0xBC), 0xBC, 0x00BA, "ordm", 'Masculine ordinal indicator', ],
0xBD => [ chr(0xBD), 0xBD, 0x03A9, "Omega", 'Greek capital letter Omega', ],
0xBE => [ chr(0xBE), 0xBE, 0x00E6, "aelig", $lc.'ae', ],
0xBF => [ chr(0xBF), 0xBF, 0x00F8, "oslash", $lc.'o with stroke', ],
0xC0 => [ chr(0xC0), 0xC0, 0x00BF, "iquest", 'Inverted question mark', ],
0xC1 => [ chr(0xC1), 0xC1, 0x00A1, "iexcl", 'Inverted exclamation mark', ],
0xC2 => [ chr(0xC2), 0xC2, 0x00AC, "not", 'Not sign', ],
0xC3 => [ chr(0xC3), 0xC3, 0x221A, "radic", 'Square root', ],
0xC4 => [ chr(0xC4), 0xC4, 0x0192, "fnof", $lc.'f with hook', ],
0xC5 => [ chr(0xC5), 0xC5, 0x2248, "asymp", 'Almost equal to', ],
0xC6 => [ chr(0xC6), 0xC6, 0x2206, "", 'Increment', ],
0xC7 => [ chr(0xC7), 0xC7, 0x00AB, "laquo", 'Left-pointing double angle'.$qm, ],
0xC8 => [ chr(0xC8), 0xC8, 0x00BB, "raquo", 'Right-pointing double angle'.$qm, ],
0xC9 => [ chr(0xC9), 0xC9, 0x2026, "hellip", 'Horizontal ellipsis', ],
0xCA => [ chr(0xCA), 0xCA, 0x00A0, "nbsp", 'No-break space', ],
0xCB => [ chr(0xCB), 0xCB, 0x00C0, "Agrave", $uc.'A with grave', ],
0xCC => [ chr(0xCC), 0xCC, 0x00C3, "Atilde", $uc.'A with tilde', ],
0xCD => [ chr(0xCD), 0xCD, 0x00D5, "Otilde", $uc.'O with tilde', ],
0xCE => [ chr(0xCE), 0xCE, 0x0152, "OElig", 'Latin capital ligature OE', ],
0xCF => [ chr(0xCF), 0xCF, 0x0153, "oelig", 'Latin small ligature oe', ],
0xD0 => [ chr(0xD0), 0xD0, 0x2013, "ndash", 'En dash', ],
0xD1 => [ chr(0xD1), 0xD1, 0x2014, "mdash", 'Em dash', ],
0xD2 => [ chr(0xD2), 0xD2, 0x201C, "ldquo", 'Left double'.$qm, ],
0xD3 => [ chr(0xD3), 0xD3, 0x201D, "rdquo", 'Right double'.$qm, ],
0xD4 => [ chr(0xD4), 0xD4, 0x2018, "lsquo", 'Left single'.$qm, ],
0xD5 => [ chr(0xD5), 0xD5, 0x2019, "rsquo", 'Right single'.$qm, ],
0xD6 => [ chr(0xD6), 0xD6, 0x00F7, "divide", 'Division sign', ],
0xD7 => [ chr(0xD7), 0xD7, 0x25CA, "loz", 'Lozenge', ],
0xD8 => [ chr(0xD8), 0xD8, 0x00FF, "yuml", $lc.'y with diaeresis', ],
0xD9 => [ chr(0xD9), 0xD9, 0x0178, "Yuml", $uc.'Y with diaeresis', ],
0xDA => [ chr(0xDA), 0xDA, 0x2044, "frasl", 'Fraction slash', ],
0xDB => [ chr(0xDB), 0xDB, 0x20AC, "euro", 'Euro sign', ],
0xDC => [ chr(0xDC), 0xDC, 0x2039, "lsaquo", 'Single left-pointing angle'.$qm, ],
0xDD => [ chr(0xDD), 0xDD, 0x203A, "rsaquo", 'Single right-pointing angle'.$qm, ],
0xDE => [ chr(0xDE), 0xDE, 0xFB01, "", 'Latin small ligature fi', ],
0xDF => [ chr(0xDF), 0xDF, 0xFB02, "", 'Latin small ligature fl', ],
0xE0 => [ chr(0xE0), 0xE0, 0x2021, "Dagger", 'Double dagger', ],
0xE1 => [ chr(0xE1), 0xE1, 0x00B7, "middot", 'Middle dot', ],
0xE2 => [ chr(0xE2), 0xE2, 0x201A, "sbquo", 'Single low-9'.$qm, ],
0xE3 => [ chr(0xE3), 0xE3, 0x201E, "bdquo", 'Double low-9'.$qm, ],
0xE4 => [ chr(0xE4), 0xE4, 0x2030, "permil", 'Per mille sign', ],
0xE5 => [ chr(0xE5), 0xE5, 0x00C2, "Acirc", $uc.'A with circumflex', ],
0xE6 => [ chr(0xE6), 0xE6, 0x00CA, "Ecirc", $uc.'E with circumflex', ],
0xE7 => [ chr(0xE7), 0xE7, 0x00C1, "Aacute", $uc.'A with acute', ],
0xE8 => [ chr(0xE8), 0xE8, 0x00CB, "Euml", $uc.'E with diaeresis', ],
0xE9 => [ chr(0xE9), 0xE9, 0x00C8, "Egrave", $uc.'E with grave', ],
0xEA => [ chr(0xEA), 0xEA, 0x00CD, "Iacute", $uc.'I with acute', ],
0xEB => [ chr(0xEB), 0xEB, 0x00CE, "Icirc", $uc.'I with circumflex', ],
0xEC => [ chr(0xEC), 0xEC, 0x00CF, "Iuml", $uc.'I with diaeresis', ],
0xED => [ chr(0xED), 0xED, 0x00CC, "Igrave", $uc.'I with grave', ],
0xEE => [ chr(0xEE), 0xEE, 0x00D3, "Oacute", $uc.'O with acute', ],
0xEF => [ chr(0xEF), 0xEF, 0x00D4, "Ocirc", $uc.'O with circumflex', ],
0xF0 => [ chr(0xF0), 0xF0, 0xF8FF, "", 'Apple logo', ],
0xF1 => [ chr(0xF1), 0xF1, 0x00D2, "Ograve", $uc.'O with grave', ],
0xF2 => [ chr(0xF2), 0xF2, 0x00DA, "Uacute", $uc.'U with acute', ],
0xF3 => [ chr(0xF3), 0xF3, 0x00DB, "Ucirc", $uc.'U with circumflex', ],
0xF4 => [ chr(0xF4), 0xF4, 0x00D9, "Ugrave", $uc.'U with grave', ],
0xF5 => [ chr(0xF5), 0xF5, 0x0131, "", $lc.'dotless i', ],
0xF6 => [ chr(0xF6), 0xF6, 0x02C6, "circ", 'Modifier letter circumflex accent', ],
0xF7 => [ chr(0xF7), 0xF7, 0x02DC, "tilde", 'Small tilde', ],
0xF8 => [ chr(0xF8), 0xF8, 0x00AF, "macr", 'Macron', ],
0xF9 => [ chr(0xF9), 0xF9, 0x02D8, "", 'Breve', ],
0xFA => [ chr(0xFA), 0xFA, 0x02D9, "", 'Dot above', ],
0xFB => [ chr(0xFB), 0xFB, 0x02DA, "", 'Ring above', ],
0xFC => [ chr(0xFC), 0xFC, 0x00B8, "cedil", 'Cedilla', ],
0xFD => [ chr(0xFD), 0xFD, 0x02DD, "", 'Double acute accent', ],
0xFE => [ chr(0xFE), 0xFE, 0x02DB, "", 'Ogonek', ],
0xFF => [ chr(0xFF), 0xFF, 0x02C7, "", 'Caron', ],
);
# Unlike getC0Data2/getC1Data2, no integrity check is run over this table.
return(\%mac);
} # macRomanData2