Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Accents #2404

Merged
merged 11 commits into from
Aug 25, 2024
Merged

Accents #2404

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions lib/LaTeXML/Engine/TeX.pool.ltxml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ use warnings;
use LaTeXML::Package;
use Unicode::Normalize;
use LaTeXML::Util::Pathname;
use charnames ':full';
use List::Util qw(min max);

###$LaTeXML::DEBUG{compiled} = 1 unless $LaTeXML::DEBUG{compiling} || $LaTeXML::DEBUG{nocompiled};
Expand Down
13 changes: 11 additions & 2 deletions lib/LaTeXML/Engine/TeX_Box.pool.ltxml
Original file line number Diff line number Diff line change
Expand Up @@ -53,14 +53,23 @@ DefConstructor('\lx@hidden@egroup', '',
reversion => '');

#======================================================================
DefMacro('\lx@nounicode {}', '\ifmmode\lx@math@nounicode#1\else\lx@text@nounicode#1\fi');
# A few useful low-level boxing things

# \lx@framed[]{}: wrap the required argument (#2) in an ltx:text element
# whose 'framed' attribute is filled from the 'frame' property.
# The 'frame' property is the stringified value of $_[1], or 'rectangle' when
# that value is false (presumably $_[1] is the optional [] argument -- confirm
# against the calling convention for property subs).
# NOTE(review): _noautoclose='1' presumably keeps the element from being
# auto-closed while its content is being built -- confirm.
DefConstructor('\lx@framed[]{}',
"<ltx:text framed='#frame' _noautoclose='1'>#2</ltx:text>",
properties => { frame => sub { ToString($_[1] || 'rectangle'); } });

# \lx@hflipped{}: wrap the argument (#1) in an ltx:text element carrying
# class 'ltx_hflipped'. Only the class is emitted here; the actual flipping is
# presumably left to the stylesheet -- confirm.
DefConstructor('\lx@hflipped{}',
"<ltx:text class='ltx_hflipped' _noautoclose='1'>#1</ltx:text>");

# \lx@overlay{}{}: emit an outer ltx:text of class 'ltx_overlay' containing
# two child ltx:text elements: the first argument (#1) with class
# 'ltx_overlay_base', followed by the second argument (#2) with class
# 'ltx_overlay_over'. Only the inner elements carry _noautoclose='1'.
# NOTE(review): presumably the stylesheet stacks the 'over' content on top of
# the 'base' content -- confirm.
DefConstructor('\lx@overlay{}{}',
"<ltx:text class='ltx_overlay'>"
. "<ltx:text class='ltx_overlay_base' _noautoclose='1'>#1</ltx:text>"
. "<ltx:text class='ltx_overlay_over' _noautoclose='1'>#2</ltx:text></ltx:text>");

#======================================================================
DefMacro('\lx@nounicode {}', '\ifmmode\lx@math@nounicode#1\else\lx@text@nounicode#1\fi');

sub reportNoUnicode {
my ($cs) = @_;
$cs = ToString($cs);
Expand All @@ -77,7 +86,7 @@ DefPrimitive('\lx@math@nounicode DefToken', sub {
Box(ToString($cs), undef, undef, $cs, class => 'ltx_nounicode'); });

DefConstructor('\lx@text@nounicode DefToken',
"<ltx:text _no_autoclose='true' class='ltx_nounicode'>#1</ltx:text>",
"<ltx:text _noautoclose='true' class='ltx_nounicode'>#1</ltx:text>",
afterDigest => sub {
reportNoUnicode(ToString($_[1]->getArg(0))); });

Expand Down
48 changes: 18 additions & 30 deletions lib/LaTeXML/Engine/TeX_Character.pool.ltxml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ package LaTeXML::Package::Pool;
use strict;
use warnings;
use LaTeXML::Package;
use Unicode::Normalize;
use LaTeXML::Util::Unicode;

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Character Family of primitive control sequences
Expand Down Expand Up @@ -70,43 +70,31 @@ sub applyAccent {

# Defines an accent command using a combining char that follows the
# 1st char of the argument. In cases where there is no argument, $standalonechar is used.
# Ideally, the pair match up with an entry in Util::Unicode's accents
sub DefAccent {
my ($accent, $combiningchar, $standalonechar, %options) = @_;
$options{above} = 1 if !(defined $options{above}) && !$options{below};
# Used for converting a char used as an above-accent to a combining char (See \accent)
AssignMapping('accent_combiner_above', $standalonechar => $combiningchar) if $options{above};
AssignMapping('accent_combiner_below', $standalonechar => $combiningchar) unless $options{above};
DefMacroI($accent, "{}",
Tokens(T_CS('\lx@applyaccent'), T_OTHER($accent),
T_OTHER($combiningchar), T_OTHER($standalonechar),
T_BEGIN, T_ARG(1), T_END),
$accent = T_CS($accent) unless ref $accent;
DefPrimitiveI($accent, "{}", sub {
my ($stomach, $letter) = @_;
applyAccent($stomach, $letter, $combiningchar, $standalonechar,
Tokens($accent, T_BEGIN, $letter, T_END)); },
protected => 1);
return; }

DefPrimitiveI('\lx@applyaccent', "DefToken Token Token {}", sub {
my ($stomach, $accent, $combiningchar, $standalonechar, $letter) = @_;
applyAccent($stomach, $letter, $combiningchar->getString, $standalonechar->getString,
Tokens(T_CS($accent->getString), T_BEGIN, $letter, T_END)); },
mode => 'text');

# This will fail if there really are "assignments" after the number!
# We're given a number pointing into the font, from which we can derive the standalone char.
# From that, we want to figure out the combining character, but there could be one for
# both the above & below cases! We'll prefer the above case.
# This will fail if there really are "assignments" after the number! (See TeX Book)
# We're given a number pointing into the font; the FontMap presumably has the standalone char.
# If there's no letter to be accented, just use the standalone.
# Otherwise, use the Util::Unicode module to find the appropriate combining character
DefPrimitive('\accent Number {}', sub {
my ($stomach, $num, $letter) = @_;
my $n = $num->valueOf;
my $fontinfo = lookupFontinfo(LookupValue('textfont_0'));
my $acc = ($fontinfo && $$fontinfo{encoding} ? FontDecode($n, $$fontinfo{encoding}) : chr($n));
my $reversion = Invocation(T_CS('\accent'), $num, $letter);
# NOTE: REVERSE LOOKUP in above accent list for the non-spacing accent char
# BUT, \accent always (?) makes an above type accent... doesn't it?
if (my $combiner = LookupMapping('accent_combiner_above', $acc)
|| LookupMapping('accent_combiner_below', $acc)) {
applyAccent($stomach, $letter, $combiner, $acc, $reversion); }
else {
Warn('unexpected', "accent$n", $stomach, "Accent '$n' not recognized");
Box(ToString($letter), undef, undef, $reversion); } });
my $encoding = LookupValue('font')->getEncoding || 'OT1';
my $char = ($encoding ? FontDecode($n, $encoding) : chr($n));
if (my $entry = unicode_accent($char)) {
applyAccent($stomach, $letter, $$entry{combiner}, $$entry{standalone},
Invocation(T_CS('\accent'), $num, $letter)); }
else { # Unknown accent ? Attempt to OVERLAY the accent on top of $letter
Digest(Tokens(T_CS('\lx@overlay'), T_BEGIN, $letter, T_END, T_BEGIN, T_OTHER($char), T_END)); } });

#======================================================================
# \chardef iq provides an alternate way to define a control sequence that returns a character.
Expand Down
43 changes: 17 additions & 26 deletions lib/LaTeXML/Engine/TeX_Fonts.pool.ltxml
Original file line number Diff line number Diff line change
Expand Up @@ -140,37 +140,28 @@ DeclareFontMap('ASCII',
'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
'x', 'y', 'z', "{", "|", "}", "~", undef]);

# Note that several entries are used for accents, and in practice will actually
# be used in something like an m:mover; thus they needn't (shouldn't?) be "small"
# There are also some questions about which choices are best
# grave & acute accents (entry 0x12 & 0x13) (often typed using 0x60 & 0x27)
# are probably best using U+60(grave accent) & U+B4(acute accent)
# but could be U+2035 (reversed prime) & U+2032 (prime). (particularly for math?)
# [we do use these for \prime, however!]
# or U+02CB (modifier letter grave accent) & U+02CA (modifier letter acute accent)
# Similarly, hat & tilde (entries 0x5E & 0x7E)
# typed using ^ 0x5E circumflex accent) & ~ 0x7E tilde
# are probably best just sticking with U+5E & U+7E
# but could be U+02C6 (modifier letter circumflex accent) U+02DC (small tilde)
# [Note that generally we're using codepoints characterized as "modifier letter"
# only when no other spacing point is available.]
# Note that several entries are used for accents.
# TeX fonts typically contain a standalone version of an accent, ie smallish & raised.
# We'll consult a table in LaTeXML::Util::Unicode to determine the equivalent combining character,
# as well as an "unwrapped" one for use in Math tokens (eg. as an overaccent)
# NOTE: 0x12--0x18, 0x5E-0x5F, 0x7D-0x7F are accents
DeclareFontMap('OT1',
["\x{0393}", "\x{0394}", "\x{0398}", "\x{039B}", "\x{039E}", "\x{03A0}", "\x{03A3}", "\x{03A5}",
"\x{03A6}", "\x{03A8}", "\x{03A9}", "\x{FB00}", "\x{FB01}", "\x{FB02}", "\x{FB03}", "\x{FB04}",
"\x{0131}", "\x{0237}", UTF(0x60), UTF(0xB4), "\x{02C7}", "\x{02D8}", UTF(0xAF), "\x{02DA}",
UTF(0xB8), UTF(0xDF), UTF(0xE6), "\x{0153}", UTF(0xF8), UTF(0xC6), "\x{152}", UTF(0xD8),
UTF(0xA0) . "\x{0335}", '!', "\x{201D}", '#', '$', '%', '&', "\x{2019}",
'(', ')', '*', '+', ',', '-', '.', '/',
'0', '1', '2', '3', '4', '5', '6', '7',
'8', '9', ':', ';', UTF(0xA1), '=', UTF(0xBF), '?',
'@', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W',
'X', 'Y', 'Z', '[', "\x{201C}", ']', "^", "\x{02D9}",
"\x{2018}", 'a', 'b', 'c', 'd', 'e', 'f', 'g',
'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
'x', 'y', 'z', "\x{2013}", "\x{2014}", "\x{02DD}", UTF(0x7E), UTF(0xA8)]);
UTF(0xA0) . "\x{0335}", '!', "\x{201D}", '#', '$', '%', '&', "\x{2019}",
'(', ')', '*', '+', ',', '-', '.', '/',
'0', '1', '2', '3', '4', '5', '6', '7',
'8', '9', ':', ';', UTF(0xA1), '=', UTF(0xBF), '?',
'@', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W',
'X', 'Y', 'Z', '[', "\x{201C}", ']', "\x{02C6}", "\x{02D9}",
"\x{2018}", 'a', 'b', 'c', 'd', 'e', 'f', 'g',
'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
'x', 'y', 'z', "\x{2013}", "\x{2014}", "\x{02DD}", "\x{02DC}", UTF(0xA8)]);

DeclareFontMap('OT1',
["\x{0393}", "\x{0394}", "\x{0398}", "\x{039B}", "\x{039E}", "\x{03A0}", "\x{03A3}", "\x{03A5}",
Expand Down
11 changes: 8 additions & 3 deletions lib/LaTeXML/Engine/TeX_Math.pool.ltxml
Original file line number Diff line number Diff line change
Expand Up @@ -618,14 +618,19 @@ DefPrimitive('\mathchardef Token SkipSpaces SkipMatch:=', sub {
return; });

DefConstructor('\mathaccent Number Digested',
"<ltx:XMApp><ltx:XMTok role='OVERACCENT'>#glyph</ltx:XMTok><ltx:XMArg>#2</ltx:XMArg></ltx:XMApp>",
"<ltx:XMApp><ltx:XMTok role='#accrole'>#glyph</ltx:XMTok><ltx:XMArg>#2</ltx:XMArg></ltx:XMApp>",
sizer => '#2', # Close enough?
afterDigest => sub {
my ($stomach, $whatsit) = @_;
my $n = $whatsit->getArg(1)->valueOf;
my ($role, $glyph) = decodeMathChar($n);
$whatsit->setProperty(glyph => $glyph) if $glyph;
$whatsit->setProperty(font => LookupValue('font')->specialize($glyph)) if $glyph;
my $accrole = 'OVERACCENT';
if (my $entry = unicode_accent($glyph)) {
$glyph = $$entry{unwrapped};
$accrole = $$entry{role}; }
$whatsit->setProperty(glyph => $glyph) if $glyph;
$whatsit->setProperty(font => LookupValue('font')->specialize($glyph)) if $glyph;
$whatsit->setProperty(accrole => $accrole) if $glyph;
return; });

# # Only used for active math characters, so far
Expand Down
15 changes: 8 additions & 7 deletions lib/LaTeXML/Engine/plain.pool.ltxml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ package LaTeXML::Package::Pool;
use strict;
use warnings;
use LaTeXML::Package;
use charnames ':full';
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

#**********************************************************************
Expand Down Expand Up @@ -696,21 +697,21 @@ DefPrimitiveI('\pounds', undef, UTF(0xA3)); # POUND

DefAccent('\`', "\x{0300}", UTF(0x60)); # COMBINING GRAVE ACCENT & GRAVE ACCENT
DefAccent("\\'", "\x{0301}", UTF(0xB4)); # COMBINING ACUTE ACCENT & ACUTE ACCENT
DefAccent('\^', "\x{0302}", UTF(0x5E)); # COMBINING CIRCUMFLEX ACCENT & CIRCUMFLEX ACCENT
DefAccent('\^', "\x{0302}", "\x{02C6}"); # COMBINING CIRCUMFLEX ACCENT & CIRCUMFLEX ACCENT
DefAccent('\"', "\x{0308}", UTF(0xA8)); # COMBINING DIAERESIS & DIAERESIS
DefAccent('\~', "\x{0303}", "~"); # COMBINING TILDE
DefAccent('\~', "\x{0303}", "\x{02DC}"); # COMBINING TILDE
DefAccent('\=', "\x{0304}", UTF(0xAF)); # COMBINING MACRON & MACRON
DefAccent('\.', "\x{0307}", "\x{02D9}"); # COMBINING DOT ABOVE & DOT ABOVE
DefAccent('\u', "\x{0306}", "\x{02D8}"); # COMBINING BREVE & BREVE
DefAccent('\v', "\x{030C}", "\x{02C7}"); # COMBINING CARON & CARON
DefAccent('\@ringaccent', "\x{030A}", "o"); # COMBINING RING ABOVE & non-combining
DefAccent('\r', "\x{030A}", "o"); # COMBINING RING ABOVE & non-combining
DefAccent('\@ringaccent', "\x{030A}", "\x{02DA}"); # COMBINING RING ABOVE & non-combining
DefAccent('\r', "\x{030A}", "\x{02DA}"); # COMBINING RING ABOVE & non-combining
DefAccent('\H', "\x{030B}", "\x{02DD}"); # COMBINING DOUBLE ACUTE ACCENT & non-combining
DefAccent('\c', "\x{0327}", UTF(0xB8), below => 1); # COMBINING CEDILLA & CEDILLA
# NOTE: The next two get defined for math, as well; See below
DefAccent('\@text@daccent', "\x{0323}", '.', below => 1); # COMBINING DOT BELOW & DOT (?)
DefAccent('\@text@baccent', "\x{0331}", UTF(0xAF), below => 1); # COMBINING MACRON BELOW & MACRON
DefAccent('\t', "\x{0361}", "-"); # COMBINING DOUBLE INVERTED BREVE & ???? What????
DefAccent('\@text@daccent', "\x{0323}", '.', below => 1); # COMBINING DOT BELOW & DOT (?)
DefAccent('\@text@baccent', "\x{0331}", '_', below => 1); # COMBINING MACRON BELOW & MACRON
DefAccent('\t', "\x{0361}", "\N{NBSP}\x{0361}"); # COMBINING DOUBLE INVERTED BREVE & ???? What????
# this one's actually defined in mathscinet.sty, but just stick it here!
DefAccent('\lfhook', "\x{0326}", ",", below => 1); # COMBINING COMMA BELOW
# I doubt that latter covers multiple chars...?
Expand Down
2 changes: 1 addition & 1 deletion lib/LaTeXML/Package/cleveref.sty.ltxml
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ sub crefMulti {
return @tokens; } }
# Since we're not grouping by type, we're ignoring \crefpairgroupconjunction, etc

DefConstructor('\lx@cref OptionalMatch:* {} Semiverbatim',
DefConstructor('\lx@cref OptionalMatch:* HyperVerbatim Semiverbatim',
"<ltx:ref labelref='#label' show='#2' ?#1(class='ltx_nolink')() _force_font='true'/>",
properties => sub { (label => CleanLabel($_[3])); });

Expand Down
6 changes: 3 additions & 3 deletions lib/LaTeXML/Package/textcomp.sty.ltxml
Original file line number Diff line number Diff line change
Expand Up @@ -25,17 +25,17 @@ DefAccent('\capitalacute', "\x{0301}", UTF(0xB4)); # \'
DefAccent('\capitalbreve', "\x{0306}", "\x{02D8}"); # \u
DefAccent('\capitalcaron', "\x{030C}", "\x{02C7}"); # \v
DefAccent('\capitalcedilla', "\x{0327}", UTF(0xB8), below => 1); # \c
DefAccent('\capitalcircumflex', "\x{0302}", UTF(0x5E)); # \^
DefAccent('\capitalcircumflex', "\x{0302}", "\x{02C6}"); # \^
DefAccent('\capitaldieresis', "\x{0308}", UTF(0xA8)); # \"
DefAccent('\capitaldotaccent', "\x{0307}", "\x{02D9}"); # \.
DefAccent('\capitalgrave', "\x{0300}", UTF(0x60)); # \`
DefAccent('\capitalhungarumlaut', "\x{030B}", "\x{02DD}"); # \H
DefAccent('\capitalmacron', "\x{0304}", UTF(0xAF)); # \=
DefAccent('\capitalnewtie', "\x{0361}", "-"); # \t
DefAccent('\capitalogonek', "\x{0328}", "\x{02DB}"); #
DefAccent('\capitalring', "\x{030A}", "o"); # \r
DefAccent('\capitalring', "\x{030A}", "\x{02DA}"); # \r
DefAccent('\capitaltie', "\x{0361}", "-"); # \t
DefAccent('\capitaltilde', "\x{0303}", "~"); # \~
DefAccent('\capitaltilde', "\x{0303}", "\x{02DC}"); # \~
DefAccent('\newtie', "\x{0361}", "-"); # \t

#======================================================================
Expand Down
4 changes: 2 additions & 2 deletions lib/LaTeXML/Post.pm
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,7 @@ use LaTeXML::Post;
use LaTeXML::Common::Error;
use base qw(LaTeXML::Post::Processor);
use LaTeXML::Common::XML;
use charnames ':full';

# This is an abstract class; A complete MathProcessor will need to define:
# $self->convertNode($doc,$xmath)
Expand Down Expand Up @@ -451,7 +452,6 @@ sub combineParallel {
# AND the nested math needs to be converted to ONLY the current target's markup
# NOT parallel within each nested math, although it should still be cross-referencable to others!
# moreover, the math will need the outerWrapper.
my $NBSP = pack('U', 0xA0); # CONSTANT

sub convertXMTextContent {
my ($self, $doc, $convertspaces, @nodes) = @_;
Expand All @@ -460,7 +460,7 @@ sub convertXMTextContent {
if ($node->nodeType == XML_TEXT_NODE) {
my $string = $node->textContent;
if ($convertspaces) {
$string =~ s/^\s+/$NBSP/; $string =~ s/\s+$/$NBSP/; }
$string =~ s/^\s+/\N{NBSP}/; $string =~ s/\s+$/\N{NBSP}/; }
push(@result, $string); }
else {
my $tag = $doc->getQName($node);
Expand Down
6 changes: 2 additions & 4 deletions lib/LaTeXML/Post/CrossRef.pm
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@ use charnames qw(:full);
use LaTeXML::Post;
use base qw(LaTeXML::Post::Processor);

my $NBSP = pack('U', 0xA0); # CONSTANT

sub new {
my ($class, %options) = @_;
my $self = $class->SUPER::new(%options);
Expand Down Expand Up @@ -635,7 +633,7 @@ sub make_bibcite {
elsif ($show =~ s/^\{([^\}]*)\}//) { # pass-thru literal, quoted with {}
push(@stuff, $1) if $1; }
elsif ($show =~ s/^~//) { # Pass-thru spaces
push(@stuff, $NBSP) if @stuff; }
push(@stuff, "\N{NBSP}") if @stuff; }
elsif ($show =~ s/^(\s+)//) { # Pass-thru spaces
push(@stuff, $1) if @stuff; }
elsif ($show =~ s/^(\W+)//) { # Pass-thru non show keywords
Expand Down Expand Up @@ -782,7 +780,7 @@ sub generateRef_aux {
elsif ($show =~ s/^\{([^\}]*)\}//) { # pass-thru literal, quoted with {}
push(@stuff, $1) if $1; }
elsif ($show =~ s/^~//) { # Pass-thru spaces
push(@stuff, $NBSP) if @stuff; }
push(@stuff, "\N{NBSP}") if @stuff; }
elsif ($show =~ s/^(\s+)//) { # Pass-thru spaces
push(@stuff, $1) if @stuff; }
elsif ($show =~ s/^(\W+)//) { # Pass-thru non show keywords
Expand Down
13 changes: 6 additions & 7 deletions lib/LaTeXML/Post/MathML.pm
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@ use LaTeXML::Util::Unicode;
use LaTeXML::Post;
use LaTeXML::Common::Font;
use List::Util qw(max);
use base qw(LaTeXML::Post::MathProcessor);
use base qw(Exporter);
use charnames ':full';
use base qw(LaTeXML::Post::MathProcessor);
use base qw(Exporter);
our @EXPORT = (
qw( &DefMathML ),
qw( &pmml &pmml_scriptsize &pmml_smaller
Expand Down Expand Up @@ -383,8 +384,6 @@ sub getXMHintSpacing {
else {
return 0; } }

my $NBSP = pack('U', 0xA0); # CONSTANT

sub pmml_internal {
no warnings 'recursion';
my ($node) = @_;
Expand Down Expand Up @@ -506,8 +505,8 @@ sub pmml_internal {
['m:mtext', {}, $node->textContent]]; }
else {
my $text = $node->textContent; # Spaces are significant here
$text =~ s/^\s+/$NBSP/;
$text =~ s/\s+$/$NBSP/;
$text =~ s/^\s+/\N{NBSP}/;
$text =~ s/\s+$/\N{NBSP}/;
return ['m:mtext', {}, $text]; } }

sub needsMathstyle {
Expand Down Expand Up @@ -1027,7 +1026,7 @@ sub pmml_text_aux {
my $type = $node->nodeType;
if ($type == XML_TEXT_NODE) {
my ($string, %mmlattr) = stylizeContent($node, 'm:mtext', %attr);
$string =~ s/^\s+/$NBSP/; $string =~ s/\s+$/$NBSP/;
$string =~ s/^\s+/\N{NBSP}/; $string =~ s/\s+$/\N{NBSP}/;
return ['m:mtext', {%mmlattr}, $string]; }
elsif ($type == XML_DOCUMENT_FRAG_NODE) {
return map { pmml_text_aux($_, %attr) } $node->childNodes; }
Expand Down
Loading
Loading