From fe9620cf6e8d756886a97754806cae8066235bec Mon Sep 17 00:00:00 2001
From: Chris B <cbieneman@microsoft.com>
Date: Wed, 8 Nov 2023 09:36:31 -0600
Subject: [PATCH] Basic lexing for preprocessor tokens (#115)

This change adds some macros to format grammar definitions similar to
how the C++ standard does. It then uses the new macros in the new lexing
sections for:

* Character Sets
* Preprocessing Tokens
* Tokens
* Comments
* Header Names

This is all extremely consistent with C/C++ circa 2011. The deviations
will start in the next sections.
---
 specs/language/hlsl.tex   |   3 +
 specs/language/lex.tex    | 138 ++++++++++++++++++++++++++++++++++++++
 specs/language/macros.tex |  36 ++++++++++
 3 files changed, 177 insertions(+)
 create mode 100644 specs/language/macros.tex

diff --git a/specs/language/hlsl.tex b/specs/language/hlsl.tex
index 92309c33..d2a7eeb9 100644
--- a/specs/language/hlsl.tex
+++ b/specs/language/hlsl.tex
@@ -8,6 +8,7 @@
 \usepackage{marginnote}
 \usepackage{parskip}
 \usepackage{titlesec}
+\usepackage{enumitem}
 
 \titleformat{\chapter}
   {\LARGE\bfseries}{\thechapter}{10pt}{}
@@ -60,6 +61,8 @@
 }{}
 
 \begin{document}
+\input{macros}
+
 \maketitle
 
 \tableofcontents
diff --git a/specs/language/lex.tex b/specs/language/lex.tex
index 389f77fe..165247d9 100644
--- a/specs/language/lex.tex
+++ b/specs/language/lex.tex
@@ -46,3 +46,141 @@
   \item External references are resolved, library references linked, and all
   translation output is collected into a single output.
 \end{enumerate}
+
+\Sec{Character Sets}{Lex.CharSet}
+
+\p The \textit{basic source character set} is a subset of the ASCII character set.
+The table below lists the valid characters and their ASCII values:
+
+\begin{center}
+  \begin{tabular}{|| c | c | c ||}
+    \hline
+    Hex ASCII Value & Character Name & Glyph or C Escape Sequence \\
+    \hline
+    0x09 & Horizontal Tab & \texttt{\textbackslash t} \\
+    0x0A & Line Feed & \texttt{\textbackslash n} \\
+    0x0D & Carriage Return & \texttt{\textbackslash r} \\
+    0x20 & Space & \\
+    0x21 & Exclamation Mark & \texttt{!}\\
+    0x22 & Quotation Mark & \texttt{"}\\
+    0x23 & Number Sign & \texttt{\#}\\
+    0x25 & Percent Sign & \texttt{\%}\\
+    0x26 & Ampersand & \texttt{\&}\\
+    0x27 & Apostrophe & \texttt{'}\\
+    0x28 & Left Parenthesis & \texttt{(}\\
+    0x29 & Right Parenthesis & \texttt{)}\\
+    0x2A & Asterisk & \texttt{*}\\
+    0x2B & Plus Sign & \texttt{+}\\
+    0x2C & Comma & \texttt{,}\\
+    0x2D & Hyphen-Minus & \texttt{-}\\
+    0x2E & Full Stop & \texttt{.}\\
+    0x2F & Solidus & \texttt{/}\\
+    0x30 .. 0x39 & Digit Zero .. Nine & \texttt{0 1 2 3 4 5 6 7 8 9}\\
+    0x3A & Colon & \texttt{:}\\
+    0x3B & Semicolon & \texttt{;}\\
+    0x3C & Less-than Sign & \texttt{<}\\
+    0x3D & Equals Sign & \texttt{=}\\
+    0x3E & Greater-than Sign & \texttt{>}\\
+    0x3F & Question Mark & \texttt{?}\\
+    0x41 .. 0x5A & Latin Capital Letter A .. Z &
+        \texttt{A B C D E F G H I J K L M}\\
+    & & \texttt{N O P Q R S T U V W X Y Z}\\
+    0x5B & Left Square Bracket & \texttt{[}\\
+    0x5C & Reverse Solidus & \texttt{\textbackslash}\\
+    0x5D & Right Square Bracket & \texttt{]}\\
+    0x5E & Circumflex Accent & \texttt{\textasciicircum}\\
+    0x5F & Underscore & \texttt{\_}\\
+    0x61 .. 0x7A & Latin Small Letter a .. z &
+        \texttt{a b c d e f g h i j k l m}\\
+    & & \texttt{n o p q r s t u v w x y z}\\
+    0x7B & Left Curly Bracket & \texttt{\{}\\
+    0x7C & Vertical Line & \texttt{|}\\
+    0x7D & Right Curly Bracket & \texttt{\}}\\
+    \hline
+  \end{tabular}
+\end{center}
+
+\p An implementation may allow source files to be written in an alternate
+\textit{extended character set} as long as that set is a superset of the
+\textit{basic source character set}. The \textit{translation character set} is
+an \textit{extended character set} or the \textit{basic source character set} as
+chosen by the implementation.
+
+\Sec{Preprocessing Tokens}{Lex.PPTokens}
+
+\begin{grammar}
+  \define{preprocessing-token}\br
+  header-name\br
+  identifier\br
+  pp-number\br
+  character-literal\br
+  string-literal\br
+  preprocessing-op-or-punc\br
+  \textnormal{each non-whitespace character from the \textit{translation
+  character set} that cannot be one of the above}
+\end{grammar}\footnote{The preprocessor is inherited from C++ 11 with no
+grammar extensions. It is specified here only for completeness.}
+
+\p Each preprocessing token that is converted to a token shall have the lexical
+form of a keyword, an identifier, a constant, a string literal or an operator or
+punctuator.
+
+\p Preprocessing tokens are the minimal lexical elements of the language during
+translation phases 3 through 6 (\ref{Lex.Phases}). Preprocessing tokens can be
+separated by whitespace in the form of comments, whitespace characters, or
+both. Whitespace may appear within a preprocessing token only as part of a
+header name or between the quotation characters in a character literal or
+string literal.
+
+\p Header name preprocessing tokens are only recognized within
+\texttt{\#include} preprocessing directives, \texttt{\_\_has\_include} expressions,
+and implementation-defined locations within \texttt{\#pragma} directives. In
+those contexts, a sequence of characters that could be either a header name or a
+string literal is recognized as a header name.
+
+\Sec{Tokens}{Lex.Tokens}
+
+\begin{grammar}
+  \define{token}\br
+  identifier\br
+  keyword\br
+  literal\br
+  operator-or-punctuator
+\end{grammar}
+
+\p There are four kinds of tokens: identifiers, keywords, literals, and
+operators or punctuators. All whitespace characters and comments are ignored
+except as they separate tokens.
+
+\Sec{Comments}{Lex.Comments}
+
+\p The characters \texttt{/*} start a comment which terminates with the
+characters \texttt{*/}. The characters \texttt{//} start a comment
+which terminates at the next new line.
+
+\Sec{Header Names}{Lex.Headers}
+
+\begin{grammar}
+  \define{header-name}\br
+  \texttt{<} h-char-sequence \texttt{>}\br
+  \texttt{"} q-char-sequence \texttt{"}
+
+  \define{h-char-sequence}\br
+  h-char\br
+  h-char-sequence h-char
+
+  \define{h-char}\br
+  \textnormal{any character in the \textit{translation character set} except
+  newline or \texttt{>}}
+
+  \define{q-char-sequence}\br
+  q-char\br
+  q-char-sequence q-char
+
+  \define{q-char}\br
+  \textnormal{any character in the \textit{translation character set} except
+  newline or \texttt{"}}
+\end{grammar}
+
+\p Character sequences in header names are mapped to header files or external
+source file names in an implementation-defined way.
diff --git a/specs/language/macros.tex b/specs/language/macros.tex
new file mode 100644
index 00000000..c80e0bd2
--- /dev/null
+++ b/specs/language/macros.tex
@@ -0,0 +1,36 @@
+%%------------------------------------------------------------------------------
+%% Grammar formatting
+%%------------------------------------------------------------------------------
+
+\newlength{\GrammarIndent} % Base indentation used by grammar blocks.
+\setlength{\GrammarIndent}{\leftmargini} % Initialised from the first-level list margin.
+\newlength{\GrammarInc} % Indentation step; grammarlist negates it for listparindent.
+\setlength{\GrammarInc}{\GrammarIndent} % Same size as the base indentation.
+\newlength{\GrammarRest} % Left margin of the grammar list body.
+\setlength{\GrammarRest}{2\GrammarIndent} % Twice the base indentation.
+
+\newenvironment{grammar} { % Typesets grammar productions; the helper macros below are local to the environment.
+  \newcommand{\define}[1]{{\textit{##1}\textnormal{:}}} % Italic nonterminal name followed by an upright colon.
+  \newcommand{\terminal}[1]{{\textnormal{##1}}} % Argument set in the normal document font.
+  \newcommand{\keyword}[1]{\texttt{##1}} % Argument set with \texttt (redefined below inside grammar).
+  \newcommand{\br}{\hfill\\*} % Fill the rest of the line, then break it; the starred form forbids a page break there.
+
+  \renewcommand{\texttt}[1]{{\small\ttfamily\upshape ##1}} % Inside grammar, \texttt is small upright typewriter text.
+
+  \newcommand{\grammarindentfirst}{\GrammarIndent} % Alias of \GrammarIndent.
+  \newcommand{\grammarindentinc}{\GrammarInc} % Alias of \GrammarInc; read by the grammarlist options.
+  \newcommand{\grammarindentrest}{\GrammarRest} % Alias of \GrammarRest; read by the grammarlist options.
+  \itshape % Body text is italic unless a helper macro overrides it.
+
+  \begin{grammarlist} % The list supplies the margins and paragraph indentation.
+  \item\relax % The environment body becomes the content of this single item.
+}{
+  \end{grammarlist} % Close the list opened by the begin-code.
+}
+
+\newlist{grammarlist}{itemize}{1} % New itemize-style list with a maximum nesting depth of 1.
+\setlist[grammarlist]{ % Layout options for grammarlist.
+  parsep=1ex, partopsep=0pt, itemsep=0pt, topsep=0pt, label={}, % Empty label; 1ex between paragraphs, no other vertical space.
+  leftmargin=\grammarindentrest, listparindent=-\grammarindentinc, % Negative paragraph indent: first lines start left of leftmargin.
+  itemindent=\listparindent % The item's own first line gets the same negative indent.
+} % NOTE(review): \grammarindentrest and \grammarindentinc exist only inside grammar; presumably the options are expanded when the list starts -- confirm.