Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Source phase imports #168

Merged
merged 2 commits into from
Mar 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 18 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ A JS module syntax lexer used in [es-module-shims](https://github.com/guybedford

Outputs the list of exports and locations of import specifiers, including dynamic import and import meta handling.

Supports new syntax features including import attributes and source phase imports.

A very small single JS file (4KiB gzipped) that includes inlined Web Assembly for very fast source analysis of ECMAScript module syntax only.

For an example of the performance, Angular 1 (720KiB) is fully parsed in 5ms, in comparison to the fastest JS parser, Acorn which takes over 100ms.
Expand All @@ -20,6 +22,8 @@ _Comprehensively handles the JS language grammar while remaining small and fast.
npm install es-module-lexer
```

See [types/lexer.d.ts](types/lexer.d.ts) for the type definitions.

For use in CommonJS:

```js
Expand Down Expand Up @@ -60,6 +64,10 @@ import { init, parse } from 'es-module-lexer';
// Comments provided to demonstrate edge cases
import /*comment!*/ ( 'asdf', { assert: { type: 'json' }});
import /*comment!*/.meta.asdf;

// Source phase imports:
import source mod from './mod.wasm';
import.source('./mod.wasm');
`;

const [imports, exports] = parse(source, 'optional-sourcename');
Expand Down Expand Up @@ -98,10 +106,10 @@ import { init, parse } from 'es-module-lexer';
// Returns -1
exports[2].le;

// Dynamic imports are indicated by imports[2].d > -1
// In this case the "d" index is the start of the dynamic import bracket
// Import type is provided by `t` value
// (1 for static, 2 for dynamic)
// Returns true
imports[2].d > -1;
imports[2].t === 2;

// Returns "asdf" (only for string literal dynamic imports)
imports[2].n
Expand All @@ -128,6 +136,13 @@ import { init, parse } from 'es-module-lexer';
// Returns "import /*comment!*/.meta"
source.slice(imports[4].s, imports[4].e);
// ss and se are the same for import meta

// Returns "'./mod.wasm'"
source.slice(imports[5].s, imports[5].e);

// Import types 4 and 5 indicate static and dynamic source phase imports
imports[5].t === 4;
imports[6].t === 5;
})();
```

Expand Down
4 changes: 2 additions & 2 deletions chompfile.toml
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ deps = ['src/lexer.h', 'src/lexer.c']
run = """
${{ WASI_PATH }}/bin/clang src/lexer.c --sysroot=${{ WASI_PATH }}/share/wasi-sysroot -o lib/lexer.wasm -nostartfiles \
"-Wl,-z,stack-size=13312,--no-entry,--compress-relocations,--strip-all,\
--export=parse,--export=sa,--export=e,--export=ri,--export=re,--export=is,--export=ie,--export=ss,--export=ip,--export=se,--export=ai,--export=id,--export=es,--export=ee,--export=els,--export=ele,--export=f,--export=ms,--export=__heap_base" \
--export=parse,--export=sa,--export=e,--export=ri,--export=re,--export=is,--export=ie,--export=it,--export=ss,--export=ip,--export=se,--export=ai,--export=id,--export=es,--export=ee,--export=els,--export=ele,--export=f,--export=ms,--export=__heap_base" \
-Wno-logical-op-parentheses -Wno-parentheses \
-Oz
"""
Expand All @@ -110,7 +110,7 @@ run = """
${{ EMSDK_PATH }}/emsdk activate 1.40.1-fastcomp

${{ EMSDK_PATH }}/fastcomp/emscripten/emcc ./src/lexer.c -o lib/lexer.emcc.js -s WASM=0 -Oz --closure 1 \
-s EXPORTED_FUNCTIONS="['_parse','_sa','_e','_ri','_re','_is','_ie','_ss','_ip','_se','_ai','_id','_es','_ee','_els','_ele','_f','_ms','_setSource']" \
-s EXPORTED_FUNCTIONS="['_parse','_sa','_e','_ri','_re','_it','_is','_ie','_ss','_ip','_se','_ai','_id','_es','_ee','_els','_ele','_f','_ms','_setSource']" \
-s ERROR_ON_UNDEFINED_SYMBOLS=0 -s SINGLE_FILE=1 -s TOTAL_STACK=4997968 -s --separate-asm -Wno-logical-op-parentheses -Wno-parentheses

# rm lib/lexer.emcc.js
Expand Down
10 changes: 5 additions & 5 deletions lib/lexer.asm.js

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions lib/lexer.emcc.asm.js

Large diffs are not rendered by default.

Binary file modified lib/lexer.wasm
Binary file not shown.
6 changes: 3 additions & 3 deletions src/lexer.asm.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ const copy = new Uint8Array(new Uint16Array([1]).buffer)[0] === 1 ? function (sr
outBuf16[i++] = (ch & 0xff) << 8 | ch >>> 8;
}
};
const words = 'xportmportlassetaromsyncunctionssertvoyiedelecontininstantybreareturdebuggeawaithrwhileforifcatcfinallels';
const words = 'xportmportlassetaourceromsyncunctionssertvoyiedelecontininstantybreareturdebuggeawaithrwhileforifcatcfinallels';

let source, name;
export function parse (_source, _name = '@') {
Expand Down Expand Up @@ -44,11 +44,11 @@ export function parse (_source, _name = '@') {

const imports = [], exports = [];
while (asm.ri()) {
const s = asm.is(), e = asm.ie(), a = asm.ai(), d = asm.id(), ss = asm.ss(), se = asm.se();
const s = asm.is(), e = asm.ie(), a = asm.ai(), d = asm.id(), ss = asm.ss(), se = asm.se(), t = asm.it();
let n;
if (asm.ip())
n = readString(d === -1 ? s : s + 1, source.charCodeAt(d === -1 ? s - 1 : s));
imports.push({ n, s, e, ss, se, d, a });
imports.push({ t, n, s, e, ss, se, d, a });
}
while (asm.re()) {
const s = asm.es(), e = asm.ee(), ls = asm.els(), le = asm.ele();
Expand Down
224 changes: 120 additions & 104 deletions src/lexer.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ static const char16_t BREA[] = { 'b', 'r', 'e', 'a' };
static const char16_t CONTIN[] = { 'c', 'o', 'n', 't', 'i', 'n' };
static const char16_t SYNC[] = {'s', 'y', 'n', 'c'};
static const char16_t UNCTION[] = {'u', 'n', 'c', 't', 'i', 'o', 'n'};
static const char16_t OURCE[] = {'o', 'u', 'r', 'c', 'e'};

// Note: parsing is based on the _assumption_ that the source is already valid
bool parse () {
Expand Down Expand Up @@ -239,124 +240,136 @@ void tryParseImportStatement () {

char16_t ch = commentWhitespace(true);

switch (ch) {
// dynamic import
case '(':
openTokenStack[openTokenDepth].token = ImportParen;
openTokenStack[openTokenDepth++].pos = pos;
if (*lastTokenPos == '.')
return;
// dynamic import indicated by positive d
char16_t* dynamicPos = pos;
// try parse a string, to record a safe dynamic import string
pos++;
ch = commentWhitespace(true);
addImport(startPos, pos, 0, dynamicPos);
dynamicImportStack[dynamicImportStackDepth++] = import_write_head;
if (ch == '\'') {
stringLiteral(ch);
}
else if (ch == '"') {
stringLiteral(ch);
}
else {
pos--;
return;
}
pos++;
char16_t* endPos = pos;
bool source_keyword = false;

if (ch == '.') {
// import.meta
pos++;
ch = commentWhitespace(true);
// import.meta indicated by d == -2
if (ch == 'm' && memcmp(pos + 1, &ETA[0], 3 * 2) == 0 && (isSpread(lastTokenPos) || *lastTokenPos != '.')) {
addImport(startPos, startPos, pos + 4, IMPORT_META);
return;
}
else if (ch == 's' && memcmp(pos + 1, &OURCE[0], 5 * 2) == 0 && (isSpread(lastTokenPos) || *lastTokenPos != '.')) {
source_keyword = true;
pos += 6;
ch = commentWhitespace(true);
if (ch == ',') {
pos++;
ch = commentWhitespace(true);
import_write_head->end = endPos;
import_write_head->assert_index = pos;
import_write_head->safe = true;
pos--;
}
else if (ch == ')') {
openTokenDepth--;
import_write_head->end = endPos;
import_write_head->statement_end = pos + 1;
import_write_head->safe = true;
dynamicImportStackDepth--;
}
else {
pos--;
}
}
else {
return;
// import.meta
case '.':
}
}
else if (pos > startPos + 6 && ch == 's' && memcmp(pos + 1, &OURCE[0], 5 * 2) == 0 && isBrOrWs(*(pos + 6))) {
source_keyword = true;
pos += 6;
ch = commentWhitespace(true);
}

// dynamic import
if (ch == '(') {
openTokenStack[openTokenDepth].token = ImportParen;
openTokenStack[openTokenDepth++].pos = pos;
if (*lastTokenPos == '.')
return;
// dynamic import indicated by positive d
char16_t* dynamicPos = pos;
// try parse a string, to record a safe dynamic import string
pos++;
ch = commentWhitespace(true);
addImport(startPos, pos, 0, dynamicPos);
if (source_keyword)
import_write_head->import_ty = DynamicSourcePhase;
dynamicImportStack[dynamicImportStackDepth++] = import_write_head;
if (ch == '\'') {
stringLiteral(ch);
}
else if (ch == '"') {
stringLiteral(ch);
}
else {
pos--;
return;
}
pos++;
char16_t* endPos = pos;
ch = commentWhitespace(true);
if (ch == ',') {
pos++;
ch = commentWhitespace(true);
// import.meta indicated by d == -2
if (ch == 'm' && memcmp(pos + 1, &ETA[0], 3 * 2) == 0 && (isSpread(lastTokenPos) || *lastTokenPos != '.'))
addImport(startPos, startPos, pos + 4, IMPORT_META);
import_write_head->end = endPos;
import_write_head->assert_index = pos;
import_write_head->safe = true;
pos--;
}
else if (ch == ')') {
openTokenDepth--;
import_write_head->end = endPos;
import_write_head->statement_end = pos + 1;
import_write_head->safe = true;
dynamicImportStackDepth--;
}
else {
pos--;
}
return;
}

if (ch == '{' && !source_keyword) {
// import statement only permitted at base-level
if (openTokenDepth != 0) {
pos--;
return;
}

default:
// no space after "import" -> not an import keyword
if (pos == startPos + 6) {
pos--;
break;
}
case '"':
case '\'':
case '*': {
// import statement only permitted at base-level
if (openTokenDepth != 0) {
pos--;
return;
}
while (pos < end) {
ch = *pos;
if (isQuote(ch)) {
readImportString(startPos, ch);
return;
}
while (pos < end) {
ch = commentWhitespace(true);
if (isQuote(ch)) {
stringLiteral(ch);
} else if (ch == '}') {
pos++;
break;
}
syntaxError();
break;
pos++;
}

case '{': {
// import statement only permitted at base-level
if (openTokenDepth != 0) {
pos--;
return;
}

while (pos < end) {
ch = commentWhitespace(true);
ch = commentWhitespace(true);
if (ch == 'f' && memcmp(pos + 1, &ROM[0], 3 * 2) != 0) {
syntaxError();
return;
}

if (isQuote(ch)) {
stringLiteral(ch);
} else if (ch == '}') {
pos++;
break;
}
pos += 4;
ch = commentWhitespace(true);

pos++;
}
if (!isQuote(ch)) {
return syntaxError();
}

ch = commentWhitespace(true);
if (ch == 'f' && memcmp(pos + 1, &ROM[0], 3 * 2) != 0) {
syntaxError();
break;
readImportString(startPos, ch, false);
}
else {
if (source_keyword || !(ch == '"' || ch == '\'' || ch == '*')) {
// no space after "import" -> not an import keyword
if (pos == startPos + (source_keyword ? 12 : 6)) {
pos--;
return;
}

pos += 4;
ch = commentWhitespace(true);

if (!isQuote(ch)) {
return syntaxError();
}
// import statement only permitted at base-level
if (openTokenDepth != 0 ) {
pos--;
return;
}
while (pos < end) {
ch = *pos;
if (isQuote(ch)) {
readImportString(startPos, ch, source_keyword);
return;
}

readImportString(startPos, ch);

break;
pos++;
}
syntaxError();
}
}

Expand Down Expand Up @@ -572,7 +585,7 @@ void tryParseExportStatement () {
// from ...
if (ch == 'f' && memcmp(pos + 1, &ROM[0], 3 * 2) == 0) {
pos += 4;
readImportString(sStartPos, commentWhitespace(true));
readImportString(sStartPos, commentWhitespace(true), false);

// There were no local names.
for (Export* exprt = prev_export_write_head == NULL ? first_export : prev_export_write_head->next; exprt != NULL; exprt = exprt->next) {
Expand Down Expand Up @@ -619,7 +632,7 @@ char16_t readExportAs (char16_t* startPos, char16_t* endPos) {
return ch;
}

void readImportString (const char16_t* ss, char16_t ch) {
void readImportString (const char16_t* ss, char16_t ch, bool source_phase) {
const char16_t* startPos = pos + 1;
if (ch == '\'') {
stringLiteral(ch);
Expand All @@ -632,6 +645,9 @@ void readImportString (const char16_t* ss, char16_t ch) {
return;
}
addImport(ss, startPos, pos, STANDARD_IMPORT);
if (source_phase) {
import_write_head->import_ty = StaticSourcePhase;
}
pos++;
ch = commentWhitespace(false);
if (!(ch == 'a' && memcmp(pos + 1, &SSERT[0], 5 * 2) == 0) && !(ch == 'w' && *(pos + 1) == 'i' && *(pos + 2) == 't' && *(pos + 3) == 'h')) {
Expand Down
Loading
Loading