diff --git a/node.gyp b/node.gyp index e3457ae6c55b9d..bb5684f28571e1 100644 --- a/node.gyp +++ b/node.gyp @@ -73,6 +73,7 @@ 'src/connection_wrap.cc', 'src/dataqueue/queue.cc', 'src/debug_utils.cc', + 'src/embedded_data.cc', 'src/encoding_binding.cc', 'src/env.cc', 'src/fs_event_wrap.cc', @@ -194,6 +195,7 @@ 'src/dataqueue/queue.h', 'src/debug_utils.h', 'src/debug_utils-inl.h', + 'src/embedded_data.h', 'src/encoding_binding.h', 'src/env_properties.h', 'src/env.h', @@ -1216,11 +1218,14 @@ 'deps/simdutf/simdutf.gyp:simdutf#host', ], 'include_dirs': [ - 'tools' + 'tools', + 'src', ], 'sources': [ 'tools/js2c.cc', - 'tools/executable_wrapper.h' + 'tools/executable_wrapper.h', + 'src/embedded_data.h', + 'src/embedded_data.cc', ], 'conditions': [ [ 'node_shared_libuv=="false"', { diff --git a/src/embedded_data.cc b/src/embedded_data.cc new file mode 100644 index 00000000000000..b5c4d28d8b400d --- /dev/null +++ b/src/embedded_data.cc @@ -0,0 +1,33 @@ +#include "embedded_data.h" +#include <vector> + +namespace node { +std::string ToOctalString(const uint8_t ch) { + // We can print most printable characters directly. The exceptions are '\' + // (escape characters), " (would end the string), and ? (trigraphs). The + // latter may be overly conservative: we compile with C++17 which doesn't + // support trigraphs. + if (ch >= ' ' && ch <= '~' && ch != '\\' && ch != '"' && ch != '?') { + return std::string(1, static_cast<char>(ch)); + } + // All other characters are blindly output as octal.
+ const char c0 = '0' + ((ch >> 6) & 7); + const char c1 = '0' + ((ch >> 3) & 7); + const char c2 = '0' + (ch & 7); + return std::string("\\") + c0 + c1 + c2; +} + +std::vector GetOctalTable() { + size_t size = 1 << 8; + std::vector code_table(size); + for (size_t i = 0; i < size; ++i) { + code_table[i] = ToOctalString(static_cast(i)); + } + return code_table; +} + +const std::string& GetOctalCode(uint8_t index) { + static std::vector table = GetOctalTable(); + return table[index]; +} +} // namespace node diff --git a/src/embedded_data.h b/src/embedded_data.h new file mode 100644 index 00000000000000..84cd5b76dca108 --- /dev/null +++ b/src/embedded_data.h @@ -0,0 +1,17 @@ +#ifndef SRC_EMBEDDED_DATA_H_ +#define SRC_EMBEDDED_DATA_H_ + +#include +#include + +// This file must not depend on node.h or other code that depends on +// the full Node.js implementation because it is used during the +// compilation of the Node.js implementation itself (especially js2c). + +namespace node { + +const std::string& GetOctalCode(uint8_t index); + +} // namespace node + +#endif // SRC_EMBEDDED_DATA_H_ diff --git a/src/node_snapshotable.cc b/src/node_snapshotable.cc index c59a4cdccb9c8a..10d646feabd464 100644 --- a/src/node_snapshotable.cc +++ b/src/node_snapshotable.cc @@ -8,6 +8,7 @@ #include "base_object-inl.h" #include "blob_serializer_deserializer-inl.h" #include "debug_utils-inl.h" +#include "embedded_data.h" #include "encoding_binding.h" #include "env-inl.h" #include "json_parser.h" @@ -748,35 +749,6 @@ static std::string FormatSize(size_t size) { return buf; } -std::string ToOctalString(const uint8_t ch) { - // We can print most printable characters directly. The exceptions are '\' - // (escape characters), " (would end the string), and ? (trigraphs). The - // latter may be overly conservative: we compile with C++17 which doesn't - // support trigraphs. 
- if (ch >= ' ' && ch <= '~' && ch != '\\' && ch != '"' && ch != '?') { - return std::string(1, static_cast(ch)); - } - // All other characters are blindly output as octal. - const char c0 = '0' + ((ch >> 6) & 7); - const char c1 = '0' + ((ch >> 3) & 7); - const char c2 = '0' + (ch & 7); - return std::string("\\") + c0 + c1 + c2; -} - -std::vector GetOctalTable() { - size_t size = 1 << 8; - std::vector code_table(size); - for (size_t i = 0; i < size; ++i) { - code_table[i] = ToOctalString(static_cast(i)); - } - return code_table; -} - -const std::string& GetOctalCode(uint8_t index) { - static std::vector table = GetOctalTable(); - return table[index]; -} - template void WriteByteVectorLiteral(std::ostream* ss, const T* vec, diff --git a/tools/js2c.cc b/tools/js2c.cc index 904fb6fa44d4f5..1d1ce31d6339e7 100644 --- a/tools/js2c.cc +++ b/tools/js2c.cc @@ -11,6 +11,7 @@ #include #include #include +#include "embedded_data.h" #include "executable_wrapper.h" #include "simdutf.h" #include "uv.h" @@ -396,11 +397,14 @@ const std::string& GetCode(uint16_t index) { #ifdef NODE_JS2C_USE_STRING_LITERALS const char* string_literal_def_template = "static const %s *%s_raw = "; +constexpr std::string_view latin1_string_literal_start = + "reinterpret_cast(\""; constexpr std::string_view ascii_string_literal_start = "reinterpret_cast(R\"JS2C1b732aee("; constexpr std::string_view utf16_string_literal_start = "reinterpret_cast(uR\"JS2C1b732aee("; -constexpr std::string_view string_literal_end = ")JS2C1b732aee\");"; +constexpr std::string_view latin1_string_literal_end = "\");"; +constexpr std::string_view utf_string_literal_end = ")JS2C1b732aee\");"; #else const char* array_literal_def_template = "static const %s %s_raw[] = "; constexpr std::string_view array_literal_start = "{\n"; @@ -424,9 +428,15 @@ constexpr std::string_view array_literal_end = "\n};\n\n"; // If NODE_JS2C_USE_STRING_LITERALS is defined, the data is output as C++ // raw strings (i.e. 
R"JS2C1b732aee(...)JS2C1b732aee") rather than as an // array. This speeds up compilation for gcc/clang. +enum class CodeType { + kAscii, // Code points are all within 0-127 + kLatin1, // Code points are all within 0-255 + kTwoByte, +}; template Fragment GetDefinitionImpl(const std::vector& code, - const std::string& var) { + const std::string& var, + CodeType type) { constexpr bool is_two_byte = std::is_same_v; static_assert(is_two_byte || std::is_same_v); @@ -440,11 +450,14 @@ Fragment GetDefinitionImpl(const std::vector& code, #ifdef NODE_JS2C_USE_STRING_LITERALS const char* literal_def_template = string_literal_def_template; - size_t def_size = 512 + code.size(); + // For code that contains Latin-1 characters, be conservative and assume + // they all need escaping: one "\" and three digits. + size_t unit = type == CodeType::kLatin1 ? 4 : 1; + size_t def_size = 512 + code.size() * unit; #else const char* literal_def_template = array_literal_def_template; constexpr size_t unit = - (is_two_byte ? 5 : 3) + 1; // 0-65536 or 0-127 and a "," + (is_two_byte ? 5 : 3) + 1; // 0-65536 or 0-255 and a "," size_t def_size = 512 + count * unit; #endif @@ -456,16 +469,56 @@ Fragment GetDefinitionImpl(const std::vector& code, assert(cur != 0); #ifdef NODE_JS2C_USE_STRING_LITERALS - constexpr std::string_view start_string_view = - is_two_byte ? 
utf16_string_literal_start : ascii_string_literal_start; + std::string_view start_string_view; + switch (type) { + case CodeType::kAscii: + start_string_view = ascii_string_literal_start; + break; + case CodeType::kLatin1: + start_string_view = latin1_string_literal_start; + break; + case CodeType::kTwoByte: + start_string_view = utf16_string_literal_start; + break; + } memcpy( result.data() + cur, start_string_view.data(), start_string_view.size()); cur += start_string_view.size(); - memcpy(result.data() + cur, code.data(), code.size()); - cur += code.size(); + if (type != CodeType::kLatin1) { + memcpy(result.data() + cur, code.data(), code.size()); + cur += code.size(); + } else { + const uint8_t* ptr = reinterpret_cast(code.data()); + for (size_t i = 0; i < count; ++i) { + // Avoid using snprintf on large chunks of data because it's much slower. + // It's fine to use it on small amount of data though. + uint8_t ch = ptr[i]; + if (ch > 127) { + Debug("In %s, found non-ASCII Latin-1 character at %zu: %d\n", + var.c_str(), + i, + ch); + } + const std::string& str = GetOctalCode(ch); + memcpy(result.data() + cur, str.c_str(), str.size()); + cur += str.size(); + } + } + std::string_view string_literal_end; + switch (type) { + case CodeType::kAscii: + string_literal_end = utf_string_literal_end; + break; + case CodeType::kLatin1: + string_literal_end = latin1_string_literal_end; + break; + case CodeType::kTwoByte: + string_literal_end = utf_string_literal_end; + break; + } memcpy(result.data() + cur, string_literal_end.data(), string_literal_end.size()); @@ -476,10 +529,10 @@ Fragment GetDefinitionImpl(const std::vector& code, array_literal_start.size()); cur += array_literal_start.size(); - const std::vector* codepoints; - - std::vector utf16_codepoints; + // Avoid using snprintf on large chunks of data because it's much slower. + // It's fine to use it on small amount of data though. 
if constexpr (is_two_byte) { + std::vector utf16_codepoints; utf16_codepoints.resize(count); size_t utf16_count = simdutf::convert_utf8_to_utf16( code.data(), @@ -488,19 +541,25 @@ Fragment GetDefinitionImpl(const std::vector& code, assert(utf16_count != 0); utf16_codepoints.resize(utf16_count); Debug("static size %zu\n", utf16_count); - codepoints = &utf16_codepoints; + for (size_t i = 0; i < utf16_count; ++i) { + const std::string& str = GetCode(utf16_codepoints[i]); + memcpy(result.data() + cur, str.c_str(), str.size()); + cur += str.size(); + } } else { - // The code is ASCII, so no need to translate. - codepoints = &code; - } - - for (size_t i = 0; i < codepoints->size(); ++i) { - // Avoid using snprintf on large chunks of data because it's much slower. - // It's fine to use it on small amount of data though. - const std::string& str = GetCode(static_cast((*codepoints)[i])); - - memcpy(result.data() + cur, str.c_str(), str.size()); - cur += str.size(); + const uint8_t* ptr = reinterpret_cast(code.data()); + for (size_t i = 0; i < count; ++i) { + uint16_t ch = static_cast(ptr[i]); + if (ch > 127) { + Debug("In %s, found non-ASCII Latin-1 character at %zu: %d\n", + var.c_str(), + i, + ch); + } + const std::string& str = GetCode(ch); + memcpy(result.data() + cur, str.c_str(), str.size()); + cur += str.size(); + } } memcpy( @@ -520,17 +579,80 @@ Fragment GetDefinitionImpl(const std::vector& code, return result; } -Fragment GetDefinition(const std::string& var, const std::vector& code) { - Debug("GetDefinition %s, code size %zu ", var.c_str(), code.size()); - bool is_one_byte = simdutf::validate_ascii(code.data(), code.size()); - Debug("with %s\n", is_one_byte ? "1-byte chars" : "2-byte chars"); +bool Simplify(const std::vector& code, + const std::string& var, + std::vector* simplified) { + // Allowlist files to avoid false positives. 
+ // TODO(joyeecheung): this could be removed if undici updates itself + // to replace "’" with "'" though we could still keep this skeleton in + // place for future hot fixes that are verified by humans. + if (var != "internal_deps_undici_undici") { + return false; + } - if (is_one_byte) { - Debug("static size %zu\n", code.size()); - return GetDefinitionImpl<char>(code, var); - } else { - return GetDefinitionImpl<uint16_t>(code, var); + size_t code_size = code.size(); + simplified->reserve(code_size); + const uint8_t* ptr = reinterpret_cast<const uint8_t*>(code.data()); + size_t simplified_count = 0; + for (size_t i = 0; i < code_size; ++i) { + switch (ptr[i]) { + case 226: { // ’ [ 226, 128, 153 ] -> ' + if (i + 2 < code_size && ptr[i + 1] == 128 && ptr[i + 2] == 153) { + simplified->push_back('\''); + i += 2; + simplified_count++; + break; + } + } + default: { + simplified->push_back(code[i]); + break; + } + } } + + if (simplified_count > 0) { + Debug("Simplified %zu characters, ", simplified_count); + Debug("old size %zu, new size %zu\n", code_size, simplified->size()); + return true; + } + return false; +} + +Fragment GetDefinition(const std::string& var, const std::vector<char>& code) { + Debug("GetDefinition %s, code size %zu\n", var.c_str(), code.size()); + bool is_ascii = simdutf::validate_ascii(code.data(), code.size()); + + if (is_ascii) { + Debug("ASCII-only, static size %zu\n", code.size()); + return GetDefinitionImpl<char>(code, var, CodeType::kAscii); + } + + std::vector<char> latin1(code.size()); + auto result = simdutf::convert_utf8_to_latin1_with_errors( + code.data(), code.size(), latin1.data()); + if (!result.error) { + latin1.resize(result.count); + Debug("Latin-1-only, old size %zu, new size %zu\n", + code.size(), + latin1.size()); + return GetDefinitionImpl<char>(latin1, var, CodeType::kLatin1); + } + + // Since V8 only supports Latin-1 and UTF16 as underlying representation + // we have to encode all files containing two-byte characters as UTF16.
+ // While some files do need two-byte characters, some just + // unintentionally have them. Replace certain characters that are known + // to have sane one-byte equivalents to save space. + std::vector<char> simplified; + if (Simplify(code, var, &simplified)) { // Something was replaced. + Debug("%s is simplified, re-generate definition\n", var.c_str()); + return GetDefinition(var, simplified); + } + + // Simplify() made no replacement, so the code still needs two-byte + // characters. Just use the original. + return GetDefinitionImpl<uint16_t>(code, var, CodeType::kTwoByte); } int AddModule(const std::string& filename,