From 27e1103216e59fba43e9674f2c5832788c2406fe Mon Sep 17 00:00:00 2001 From: aiuto Date: Wed, 24 Jun 2020 06:58:04 -0700 Subject: [PATCH] Fix ctx.actions.write to write as ISO 8859-1 so it is symetric with BUILD and .bzl input parsing. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Internally BUILD and .bzl files are read as uninterpreted raw byte strings and not as UTF-8, so we should write content back out the same way. For example, if a BUILD file contains: ``` ... notice = "Copyright © 2019 Acme LLC", ``` Bazel will store the copyright symbol as two distinct octets ([0xc2, 0xa9]). When writing that with an action, the same two octets should be emitted. The behavior fixed by this change is to not interpret each character as a standalone Unicode code point to be encoded as UTF-8. While this change is appropriate for strings read from BUILD files, there is still a discrepancy in handling file paths with non Latin1 characters on Windows file systems. That is described in https://github.com/bazelbuild/bazel/issues/11602. While this change does produce an observable behavior change, I am considering it a bug fix rather than an incompatible change because the previous behavior was both wrong and also (as described in #11602) inconsistent across different OSes. Closes https://github.com/bazelbuild/bazel/issues/10174 RELNOTES: Fix behavior of ctx.actions.write so content is written without an incorrect encoding to UTF-8. See https://github.com/bazelbuild/bazel/issues/10174 for details. PiperOrigin-RevId: 318056290 --- .../lib/analysis/actions/FileWriteAction.java | 20 +++- .../shell/integration/loading_phase_test.sh | 92 +++++++++++++++++++ 2 files changed, 107 insertions(+), 5 deletions(-) diff --git a/src/main/java/com/google/devtools/build/lib/analysis/actions/FileWriteAction.java b/src/main/java/com/google/devtools/build/lib/analysis/actions/FileWriteAction.java index 5cb26b47ee09dd..15aa1e808c7e75 100644 --- a/src/main/java/com/google/devtools/build/lib/analysis/actions/FileWriteAction.java +++ b/src/main/java/com/google/devtools/build/lib/analysis/actions/FileWriteAction.java @@ -14,7 +14,7 @@ package com.google.devtools.build.lib.analysis.actions; -import static java.nio.charset.StandardCharsets.UTF_8; +import static java.nio.charset.StandardCharsets.ISO_8859_1; import com.google.common.annotations.VisibleForTesting; import com.google.devtools.build.lib.actions.ActionExecutionContext; @@ -39,7 +39,12 @@ /** * Action to write a file whose contents are known at analysis time. * - *

The output is always UTF-8 encoded. + *

The output file is generally encoded as UTF-8, but by an unusual path. BUILD files and + * directory entries, which are actually UTF-8, are misinterpreted by Bazel as Latin1, so that most + * Strings within the build language use this unusual representation. FileWriteAction writes those + * Strings out again as Latin1. + * + *

TODO(b/146554973): Change this implementation when that is addressed. * *

TODO(bazel-team): Choose a better name to distinguish this class from {@link * BinaryFileWriteAction}. @@ -171,7 +176,7 @@ private static final class CompressedString extends LazyString { final int uncompressedSize; CompressedString(String chars) { - byte[] dataToCompress = chars.getBytes(UTF_8); + byte[] dataToCompress = chars.getBytes(ISO_8859_1); ByteArrayOutputStream byteStream = new ByteArrayOutputStream(dataToCompress.length); try (GZIPOutputStream zipStream = new GZIPOutputStream(byteStream)) { zipStream.write(dataToCompress); @@ -202,7 +207,7 @@ public String toString() { // This should be impossible since we're reading from a byte array. throw new RuntimeException(e); } - return new String(uncompressedBytes, UTF_8); + return new String(uncompressedBytes, ISO_8859_1); } } @@ -216,6 +221,11 @@ boolean usesCompression() { * *

Note that if the string is lazily computed or compressed, calling this method will force its * computation or decompression. No attempt is made by FileWriteAction to cache the result. + * + *

Note that the content is a not a normal Java String. When Bazel parses BUILD files, it + * misinterprets the bytes as Latin1, so a code point with a 3-byte UTF-8 encoding will take 3 + * chars internally. To reverse this process, you must encode this string as Latin1, giving you + * back the correct UTF-8 encoding of the original input. */ public String getFileContents() { return fileContents.toString(); @@ -235,7 +245,7 @@ public DeterministicWriter newDeterministicWriter(ActionExecutionContext ctx) { return new DeterministicWriter() { @Override public void writeOutputFile(OutputStream out) throws IOException { - out.write(getFileContents().getBytes(UTF_8)); + out.write(getFileContents().getBytes(ISO_8859_1)); } }; } diff --git a/src/test/shell/integration/loading_phase_test.sh b/src/test/shell/integration/loading_phase_test.sh index f9fd9e7fd34c18..9c33363de4eaf7 100755 --- a/src/test/shell/integration/loading_phase_test.sh +++ b/src/test/shell/integration/loading_phase_test.sh @@ -498,4 +498,96 @@ EOF done } +# Test that actions.write correctly emits a UTF-8 encoded attribute value as +# UTF-8. +function test_actions_write_utf8_attribute() { + local -r pkg="${FUNCNAME}" + mkdir -p "$pkg" || fail "could not create \"$pkg\"" + + cat >"${pkg}/def.bzl" <<'EOF' +def _write_attribute_impl(ctx): + ctx.actions.write( + output = ctx.outputs.out, + # adding a NL at the end to make the diff below easier + content = ctx.attr.text + '\n', + ) + return [] + +write_attribute = rule( + implementation = _write_attribute_impl, + attrs = { + "text": attr.string(), + "out": attr.output(), + }, +) +EOF + + cat >"${pkg}/BUILD" <<'EOF' +load(":def.bzl", "write_attribute") +write_attribute( + name = "text_with_non_latin1_chars", + # U+41, U+2117, U+4E16, U+1F63F (1,2,3,4-byte UTF-8 encodings), 10 bytes. + text = "A©世😿", + out = "out", +) +EOF + bazel build "${pkg}:text_with_non_latin1_chars" || fail "Expected build to succeed" + diff $(bazel info "${PRODUCT_NAME}-bin")/$pkg/out <(echo 'A©世😿') || fail 'diff failed' +} + +# Test that actions.write emits a file name containing non-Latin1 characters as +# a UTF-8 encoded string. +function test_actions_write_not_latin1_path() { + # TODO(https://github.com/bazelbuild/bazel/issues/11602): Enable after that is fixed. + if $is_windows ; then + echo 'Skipping test_actions_write_not_latin1_path on Windows. See #11602' + return + fi + + local -r pkg="${FUNCNAME}" + mkdir -p "$pkg" || fail "could not create \"$pkg\"" + + filename='A©世😿.file' # see above for an explanation. + echo hello >"${pkg}/${filename}" + + cat >"${pkg}/def.bzl" <<'EOF' +def _write_paths_impl(ctx): + # srcs is a list, but we only expect one entry. + if len(ctx.attr.srcs) != 1: + fail('expected exactly 1 file for srcs. got %d' % len(ctx.attr.srcs)) + file_name = ctx.attr.srcs[0].label.name + ctx.actions.write( + output = ctx.outputs.out, + content = file_name, + ) + return [] + +write_paths = rule( + implementation = _write_paths_impl, + attrs = { + "srcs": attr.label_list(allow_files=True), + "out": attr.output(), + }, +) +EOF + + cat >"${pkg}/BUILD" <<'EOF' +load(":def.bzl", "write_paths") +write_paths( + name = "path_with_non_latin1", + # Use a glob to ensure that the value is read from the file system and not + # out of BUILD. + srcs = glob(["*.file"]), + out = "paths.txt", +) +EOF + + bazel build "${pkg}:path_with_non_latin1" >output 2>&1 || ( + echo '== build output' + cat output + fail "Expected build to succeed" + ) + assert_contains "^${filename}$" $(bazel info "${PRODUCT_NAME}-bin")/$pkg/paths.txt +} + run_suite "Integration tests of ${PRODUCT_NAME} using loading/analysis phases."