Skip to content

Commit

Permalink
EASY: Allow Unicode line terminators in strings
Browse files Browse the repository at this point in the history
Summary:
ES10 allows U+2028 (LINE SEPARATOR), U+2029 (PARAGRAPH SEPARATOR) in
string literals for compatibility with JSON. Remove the existing check.
Add a test.

Github: Closes #235

Reviewed By: avp

Differential Revision: D21386937

fbshipit-source-id: 0cbf2d2ff449c33f6f1f3b42ad28ed3650650b36
  • Loading branch information
tmikov authored and facebook-github-bot committed May 5, 2020
1 parent 4e1a14b commit d304cb5
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 18 deletions.
13 changes: 1 addition & 12 deletions lib/Parser/JSLexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,6 @@ const char *g_tokenStr[] = {

const int UTF8_LINE_TERMINATOR_CHAR0 = 0xe2;

/// Check whether the bytes starting at \p curCharPtr_ form the UTF-8 encoding
/// of a Unicode line terminator:
///   U+2028 LINE SEPARATOR      -> e2 80 a8
///   U+2029 PARAGRAPH SEPARATOR -> e2 80 a9
/// Bytes are inspected strictly left to right and a later byte is read only
/// when every earlier byte matched, so a mismatch (including a NUL byte) stops
/// the scan before anything further is touched.
/// \return true if a line or paragraph separator starts at \p curCharPtr_.
inline bool matchUnicodeLineTerminator(const char *curCharPtr_) {
  // View the input as unsigned bytes so values >= 0x80 compare correctly.
  const unsigned char *bytes =
      reinterpret_cast<const unsigned char *>(curCharPtr_);

  // Lead byte shared by both separators.
  if (bytes[0] != UTF8_LINE_TERMINATOR_CHAR0)
    return false;

  // Second byte is also common to both encodings.
  if (bytes[1] != 0x80)
    return false;

  // Only the final byte distinguishes U+2028 from U+2029.
  const unsigned char last = bytes[2];
  return last == 0xa8 || last == 0xa9;
}

inline bool matchUnicodeLineTerminatorOffset1(const char *curCharPtr_) {
// Line separator \u2028 UTF8 encoded is : e2 80 a8
// Paragraph separator \u2029 UTF8 encoded is: e2 80 a9
Expand Down Expand Up @@ -1385,9 +1376,7 @@ void JSLexer::scanString() {
tmpStorage_.push_back((unsigned char)*curCharPtr_++);
break;
}
} else if (LLVM_UNLIKELY(
*curCharPtr_ == '\n' || *curCharPtr_ == '\r' ||
matchUnicodeLineTerminator(curCharPtr_))) {
} else if (LLVM_UNLIKELY(*curCharPtr_ == '\n' || *curCharPtr_ == '\r')) {
error(SMLoc::getFromPointer(curCharPtr_), "non-terminated string");
sm_.note(token_.getStartLoc(), "string started here");
break;
Expand Down
40 changes: 34 additions & 6 deletions unittests/Parser/JSLexerTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -379,8 +379,7 @@ TEST(JSLexerTest, StringTest1) {

JSLexer lex(
"'aa' \"bb\" 'open1\n"
"'open2\xe2\x80\xa8"
"\"open3",
"\"open2",
sm,
alloc);

Expand All @@ -402,13 +401,42 @@ TEST(JSLexerTest, StringTest1) {
EXPECT_STREQ("open2", lex.getCurToken()->getStringLiteral()->c_str());
ASSERT_TRUE(lex.isNewLineBeforeCurrentToken());

ASSERT_EQ(TokenKind::eof, lex.advance()->getKind());
ASSERT_FALSE(lex.isNewLineBeforeCurrentToken());
}

TEST(JSLexerTest, StringLineParaSepTest) {
JSLexer::Allocator alloc;
SourceErrorManager sm;
DiagContext diag(sm);

// Test that the Unicode line and paragraph separators are valid in a string
// (since ES10).
JSLexer lex(
"'\xe2\x80\xa8' "
"'\xe2\x80\xa9' "
"'\\\xe2\x80\xa8' "
"'\\\xe2\x80\xa9' ",
sm,
alloc);

ASSERT_EQ(TokenKind::string_literal, lex.advance()->getKind());
ASSERT_EQ(1, diag.getErrCountClear());
EXPECT_STREQ("open3", lex.getCurToken()->getStringLiteral()->c_str());
ASSERT_TRUE(lex.isNewLineBeforeCurrentToken());
ASSERT_EQ(0, diag.getErrCountClear());
EXPECT_STREQ("\xe2\x80\xa8", lex.getCurToken()->getStringLiteral()->c_str());

ASSERT_EQ(TokenKind::string_literal, lex.advance()->getKind());
ASSERT_EQ(0, diag.getErrCountClear());
EXPECT_STREQ("\xe2\x80\xa9", lex.getCurToken()->getStringLiteral()->c_str());

ASSERT_EQ(TokenKind::string_literal, lex.advance()->getKind());
ASSERT_EQ(0, diag.getErrCountClear());
EXPECT_STREQ("", lex.getCurToken()->getStringLiteral()->c_str());

ASSERT_EQ(TokenKind::string_literal, lex.advance()->getKind());
ASSERT_EQ(0, diag.getErrCountClear());
EXPECT_STREQ("", lex.getCurToken()->getStringLiteral()->c_str());

ASSERT_EQ(TokenKind::eof, lex.advance()->getKind());
ASSERT_FALSE(lex.isNewLineBeforeCurrentToken());
}

TEST(JSLexerTest, StringTest2) {
Expand Down

0 comments on commit d304cb5

Please sign in to comment.