Reject invalid string literal whitespace on unescape (#793)

This is based on discussion on #732: that we should probably parse the invalid whitespace, then reject it as part of string validation, rather than having different parses. I worry the question of "how is this parsed" may lead to subtly unexpected results if we aren't consistent, so I'm switching the logic from the lexer to the unescape library (and also adjusting the list of rejected whitespace).
carbon-language · Aug 30, 2021 · 36c9b28 · 36c9b28
1 parent e3a56e2
commit 36c9b28
Show file tree

Hide file tree

Showing 7 changed files with 81 additions and 70 deletions.
diff --git a/.bazelignore b/.bazelignore
diff --git a/common/string_helpers.cpp b/common/string_helpers.cpp
@@ -27,58 +27,68 @@ auto UnescapeStringLiteral(llvm::StringRef source)
   size_t i = 0;
   while (i < source.size()) {
     char c = source[i];
-    if (c == '\\') {
-      ++i;
-      if (i == source.size()) {
-        return std::nullopt;
-      }
-      switch (source[i]) {
-        case 'n':
-          ret.push_back('\n');
-          break;
-        case 'r':
-          ret.push_back('\r');
-          break;
-        case 't':
-          ret.push_back('\t');
-          break;
-        case '0':
-          if (i + 1 < source.size() && llvm::isDigit(source[i + 1])) {
-            // \0[0-9] is reserved.
-            return std::nullopt;
-          }
-          ret.push_back('\0');
-          break;
-        case '"':
-          ret.push_back('"');
-          break;
-        case '\'':
-          ret.push_back('\'');
-          break;
-        case '\\':
-          ret.push_back('\\');
-          break;
-        case 'x': {
-          i += 2;
-          if (i >= source.size()) {
-            return std::nullopt;
+    switch (c) {
+      case '\\':
+        ++i;
+        if (i == source.size()) {
+          return std::nullopt;
+        }
+        switch (source[i]) {
+          case 'n':
+            ret.push_back('\n');
+            break;
+          case 'r':
+            ret.push_back('\r');
+            break;
+          case 't':
+            ret.push_back('\t');
+            break;
+          case '0':
+            if (i + 1 < source.size() && llvm::isDigit(source[i + 1])) {
+              // \0[0-9] is reserved.
+              return std::nullopt;
+            }
+            ret.push_back('\0');
+            break;
+          case '"':
+            ret.push_back('"');
+            break;
+          case '\'':
+            ret.push_back('\'');
+            break;
+          case '\\':
+            ret.push_back('\\');
+            break;
+          case 'x': {
+            i += 2;
+            if (i >= source.size()) {
+              return std::nullopt;
+            }
+            std::optional<char> c1 = FromHex(source[i - 1]);
+            std::optional<char> c2 = FromHex(source[i]);
+            if (c1 == std::nullopt || c2 == std::nullopt) {
+              return std::nullopt;
+            }
+            ret.push_back(16 * *c1 + *c2);
+            break;
           }
-          std::optional<char> c1 = FromHex(source[i - 1]);
-          std::optional<char> c2 = FromHex(source[i]);
-          if (c1 == std::nullopt || c2 == std::nullopt) {
+          case 'u':
+            FATAL() << "\\u is not yet supported in string literals";
+          default:
+            // Unsupported.
             return std::nullopt;
-          }
-          ret.push_back(16 * *c1 + *c2);
-          break;
         }
-        case 'u':
-          FATAL() << "\\u is not yet supported in string literals";
-        default:
-          // Unsupported.
-          return std::nullopt;
-      }
-    } else {
-      ret.push_back(c);
+        break;
+
+      case '\t':
+        // Disallow non-` ` horizontal whitespace:
+        // https://github.com/carbon-language/carbon-lang/blob/trunk/docs/design/lexical_conventions/whitespace.md
+        // TODO: This doesn't handle unicode whitespace.
+        return std::nullopt;
+
+      default:
+        ret.push_back(c);
+        break;
     }
     ++i;
   }

diff --git a/common/string_helpers_test.cpp b/common/string_helpers_test.cpp
@@ -17,6 +17,8 @@ namespace {
 
 TEST(UnescapeStringLiteral, Valid) {
   EXPECT_THAT(UnescapeStringLiteral("test"), Optional(Eq("test")));
+  EXPECT_THAT(UnescapeStringLiteral("okay whitespace"),
+              Optional(Eq("okay whitespace")));
   EXPECT_THAT(UnescapeStringLiteral("test\n"), Optional(Eq("test\n")));
   EXPECT_THAT(UnescapeStringLiteral("test\\n"), Optional(Eq("test\n")));
   EXPECT_THAT(UnescapeStringLiteral("abc\\ndef"), Optional(Eq("abc\ndef")));

diff --git a/docs/design/lexical_conventions/string_literals.md b/docs/design/lexical_conventions/string_literals.md
@@ -83,8 +83,9 @@ A _simple string literal_ is formed of a sequence of:
 
 -   Characters other than `\` and `"`.
     -   Only space characters (U+0020) are valid whitespace in a string literal.
-        Other whitespace, including tabs and newlines, are disallowed but parse
-        as part of the string for error recovery purposes.
+    -   Other [horizontal whitespace](whitespace.md), including tabs, are
+        disallowed but parse as part of the string for error recovery purposes.
+    -   Vertical whitespace will not parse as part of a simple string literal.
 -   [Escape sequences](#escape-sequences).
     -   Each escape sequence is replaced with the corresponding character
         sequence or code unit sequence.

diff --git a/docs/design/lexical_conventions/whitespace.md b/docs/design/lexical_conventions/whitespace.md
@@ -27,17 +27,19 @@ Unicode Annex #31 suggests selecting whitespace characters based on the
 characters with Unicode property `Pattern_White_Space`, which is currently these
 11 characters:
 
--   U+0009 CHARACTER TABULATION (horizontal tab)
--   U+000A LINE FEED (traditional newline)
--   U+000B LINE TABULATION (vertical tab)
--   U+000C FORM FEED (page break)
--   U+000D CARRIAGE RETURN
--   U+0020 SPACE
--   U+0085 NEXT LINE (Unicode newline)
--   U+200E LEFT-TO-RIGHT MARK
--   U+200F RIGHT-TO-LEFT MARK
--   U+2028 LINE SEPARATOR
--   U+2029 PARAGRAPH SEPARATOR
+-   Horizontal whitespace:
+    -   U+0009 CHARACTER TABULATION (horizontal tab)
+    -   U+0020 SPACE
+    -   U+200E LEFT-TO-RIGHT MARK
+    -   U+200F RIGHT-TO-LEFT MARK
+-   Vertical whitespace:
+    -   U+000A LINE FEED (traditional newline)
+    -   U+000B LINE TABULATION (vertical tab)
+    -   U+000C FORM FEED (page break)
+    -   U+000D CARRIAGE RETURN
+    -   U+0085 NEXT LINE (Unicode newline)
+    -   U+2028 LINE SEPARATOR
+    -   U+2029 PARAGRAPH SEPARATOR
 
 The quantity and kind of whitespace separating tokens is ignored except where
 otherwise specified.

diff --git a/executable_semantics/syntax/lexer.lpp b/executable_semantics/syntax/lexer.lpp
@@ -79,12 +79,14 @@ WHILE                "while"
 identifier            [A-Za-z_][A-Za-z0-9_]*
 sized_type_literal    [iuf][1-9][0-9]*
 integer_literal       [0-9]+
-string_literal        \"([^\\\"\n\t]|\\.)*\"
 horizontal_whitespace [ \t\r]
 whitespace            [ \t\r\n]
 one_line_comment      \/\/[^\n]*\n
 operand_start         [(A-Za-z0-9_"]
 
+/* Single-line string literals should reject vertical whitespace. */
+string_literal        \"([^\\\"\n\v\f\r]|\\.)*\"
+
 %{
   // This macro is expanded immediately before each action specified below.
   //

diff --git a/executable_semantics/testdata/string_fail6.golden b/executable_semantics/testdata/string_fail6.golden
@@ -1,2 +1,2 @@
-COMPILATION ERROR: executable_semantics/testdata/string_fail6.carbon:6: invalid character '\x22' in source file.
+COMPILATION ERROR: executable_semantics/testdata/string_fail6.carbon:6: Invalid escaping in string: "new	line"
 EXIT CODE: 255