From 27e1103216e59fba43e9674f2c5832788c2406fe Mon Sep 17 00:00:00 2001
From: aiuto <aiuto@google.com>
Date: Wed, 24 Jun 2020 06:58:04 -0700
Subject: [PATCH] Fix ctx.actions.write to write as ISO 8859-1 so it is
 symetric with BUILD and .bzl  input parsing.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Internally BUILD and .bzl files are read as uninterpreted raw byte strings and
not as UTF-8, so we should write content back out the same way.  For example,
if a BUILD file contains:

```
...
notice = "Copyright © 2019 Acme LLC",
```

Bazel will store the copyright symbol as two distinct octets ([0xc2, 0xa9]).
When writing that with an action, the same two octets should be emitted.
The behavior fixed by this change is to not interpret each character as
a standalone Unicode code point to be encoded as UTF-8.

While this change is appropriate for strings read from BUILD files, there is still a discrepancy in handling file paths with non Latin1 characters on Windows file systems.  That is described in https://github.com/bazelbuild/bazel/issues/11602.

While this change does produce an observable behavior change, I am considering it a bug fix rather than an incompatible change because the previous behavior was both wrong and also (as described in #11602) inconsistent across different OSes.

Closes https://github.com/bazelbuild/bazel/issues/10174

RELNOTES:
Fix behavior of ctx.actions.write so content is written without an incorrect encoding to UTF-8.
See https://github.com/bazelbuild/bazel/issues/10174 for details.
PiperOrigin-RevId: 318056290
---
 .../lib/analysis/actions/FileWriteAction.java | 20 +++-
 .../shell/integration/loading_phase_test.sh   | 92 +++++++++++++++++++
 2 files changed, 107 insertions(+), 5 deletions(-)
diff --git a/src/main/java/com/google/devtools/build/lib/analysis/actions/FileWriteAction.java b/src/main/java/com/google/devtools/build/lib/analysis/actions/FileWriteAction.java
index 5cb26b47ee09dd..15aa1e808c7e75 100644
--- a/src/main/java/com/google/devtools/build/lib/analysis/actions/FileWriteAction.java
+++ b/src/main/java/com/google/devtools/build/lib/analysis/actions/FileWriteAction.java
@@ -14,7 +14,7 @@
 
 package com.google.devtools.build.lib.analysis.actions;
 
-import static java.nio.charset.StandardCharsets.UTF_8;
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.devtools.build.lib.actions.ActionExecutionContext;
@@ -39,7 +39,12 @@
 /**
  * Action to write a file whose contents are known at analysis time.
  *
- * <p>The output is always UTF-8 encoded.
+ * <p>The output file is generally encoded as UTF-8, but by an unusual path. BUILD files and
+ * directory entries, which are actually UTF-8, are misinterpreted by Bazel as Latin1, so that most
+ * Strings within the build language use this unusual representation. FileWriteAction writes those
+ * Strings out again as Latin1.
+ *
+ * <p>TODO(b/146554973): Change this implementation when that is addressed.
  *
  * <p>TODO(bazel-team): Choose a better name to distinguish this class from {@link
  * BinaryFileWriteAction}.
@@ -171,7 +176,7 @@ private static final class CompressedString extends LazyString {
     final int uncompressedSize;
 
     CompressedString(String chars) {
-      byte[] dataToCompress = chars.getBytes(UTF_8);
+      byte[] dataToCompress = chars.getBytes(ISO_8859_1);
       ByteArrayOutputStream byteStream = new ByteArrayOutputStream(dataToCompress.length);
       try (GZIPOutputStream zipStream = new GZIPOutputStream(byteStream)) {
         zipStream.write(dataToCompress);
@@ -202,7 +207,7 @@ public String toString() {
         // This should be impossible since we're reading from a byte array.
         throw new RuntimeException(e);
       }
-      return new String(uncompressedBytes, UTF_8);
+      return new String(uncompressedBytes, ISO_8859_1);
     }
   }
 
@@ -216,6 +221,11 @@ boolean usesCompression() {
    *
    * <p>Note that if the string is lazily computed or compressed, calling this method will force its
    * computation or decompression. No attempt is made by FileWriteAction to cache the result.
+   *
+   * <p>Note that the content is a not a normal Java String. When Bazel parses BUILD files, it
+   * misinterprets the bytes as Latin1, so a code point with a 3-byte UTF-8 encoding will take 3
+   * chars internally. To reverse this process, you must encode this string as Latin1, giving you
+   * back the correct UTF-8 encoding of the original input.
    */
   public String getFileContents() {
     return fileContents.toString();
@@ -235,7 +245,7 @@ public DeterministicWriter newDeterministicWriter(ActionExecutionContext ctx) {
     return new DeterministicWriter() {
       @Override
       public void writeOutputFile(OutputStream out) throws IOException {
-        out.write(getFileContents().getBytes(UTF_8));
+        out.write(getFileContents().getBytes(ISO_8859_1));
       }
     };
   }
diff --git a/src/test/shell/integration/loading_phase_test.sh b/src/test/shell/integration/loading_phase_test.sh
index f9fd9e7fd34c18..9c33363de4eaf7 100755
--- a/src/test/shell/integration/loading_phase_test.sh
+++ b/src/test/shell/integration/loading_phase_test.sh
@@ -498,4 +498,96 @@ EOF
   done
 }
 
+# Test that actions.write correctly emits a UTF-8 encoded attribute value as
+# UTF-8.
+function test_actions_write_utf8_attribute() {
+  local -r pkg="${FUNCNAME}"
+  mkdir -p "$pkg" || fail "could not create \"$pkg\""
+
+  cat >"${pkg}/def.bzl" <<'EOF'
+def _write_attribute_impl(ctx):
+    ctx.actions.write(
+        output = ctx.outputs.out,
+        # adding a NL at the end to make the diff below easier
+        content = ctx.attr.text + '\n',
+    )
+    return []
+
+write_attribute = rule(
+    implementation = _write_attribute_impl,
+    attrs = {
+        "text": attr.string(),
+        "out": attr.output(),
+    },
+)
+EOF
+
+  cat >"${pkg}/BUILD" <<'EOF'
+load(":def.bzl", "write_attribute")
+write_attribute(
+    name = "text_with_non_latin1_chars",
+    # U+41, U+2117, U+4E16, U+1F63F  (1,2,3,4-byte UTF-8 encodings), 10 bytes.
+    text = "A©世😿",
+    out = "out",
+)
+EOF
+  bazel build "${pkg}:text_with_non_latin1_chars" || fail "Expected build to succeed"
+  diff $(bazel info "${PRODUCT_NAME}-bin")/$pkg/out <(echo 'A©世😿') || fail 'diff failed'
+}
+
+# Test that actions.write emits a file name containing non-Latin1 characters as
+# a UTF-8 encoded string.
+function test_actions_write_not_latin1_path() {
+  # TODO(https://github.com/bazelbuild/bazel/issues/11602): Enable after that is fixed.
+  if $is_windows ; then
+    echo 'Skipping test_actions_write_not_latin1_path on Windows. See #11602'
+    return
+  fi
+
+  local -r pkg="${FUNCNAME}"
+  mkdir -p "$pkg" || fail "could not create \"$pkg\""
+
+  filename='A©世😿.file'  # see above for an explanation.
+  echo hello >"${pkg}/${filename}"
+
+  cat >"${pkg}/def.bzl" <<'EOF'
+def _write_paths_impl(ctx):
+    # srcs is a list, but we only expect one entry.
+    if len(ctx.attr.srcs) != 1:
+        fail('expected exactly 1 file for srcs. got %d' % len(ctx.attr.srcs))
+    file_name = ctx.attr.srcs[0].label.name
+    ctx.actions.write(
+        output = ctx.outputs.out,
+        content = file_name,
+    )
+    return []
+
+write_paths = rule(
+    implementation = _write_paths_impl,
+    attrs = {
+        "srcs": attr.label_list(allow_files=True),
+        "out": attr.output(),
+    },
+)
+EOF
+
+  cat >"${pkg}/BUILD" <<'EOF'
+load(":def.bzl", "write_paths")
+write_paths(
+    name = "path_with_non_latin1",
+    # Use a glob to ensure that the value is read from the file system and not
+    # out of BUILD.
+    srcs = glob(["*.file"]),
+    out = "paths.txt",
+)
+EOF
+
+  bazel build "${pkg}:path_with_non_latin1" >output 2>&1 || (
+    echo '== build output'
+    cat output
+    fail "Expected build to succeed"
+  )
+  assert_contains "^${filename}$" $(bazel info "${PRODUCT_NAME}-bin")/$pkg/paths.txt
+}
+
 run_suite "Integration tests of ${PRODUCT_NAME} using loading/analysis phases."