From 53b94634639e673eeac880b005e9e64981259035 Mon Sep 17 00:00:00 2001
From: Eric Vergnaud
Date: Fri, 2 Aug 2024 14:20:26 +0200
Subject: [PATCH] support files with unicode BOM (#138)

Files uploaded using BinaryIO can start with a Unicode BOM. When
downloaded, these files must be decoded by detecting the BOM.

This does not apply to notebooks, because Databricks transforms the
uploaded code before storage (prepending a header) and ignores the
BOM, if any (possibly a Databricks bug). As a result, it is not
possible to store notebooks with a BOM.

---------

Co-authored-by: Eric Vergnaud
---
 src/databricks/labs/blueprint/paths.py | 19 ++++++++++++++++---
 tests/integration/test_paths.py        | 21 +++++++++++++++++++++
 2 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/src/databricks/labs/blueprint/paths.py b/src/databricks/labs/blueprint/paths.py
index 48b4e30..c203db6 100644
--- a/src/databricks/labs/blueprint/paths.py
+++ b/src/databricks/labs/blueprint/paths.py
@@ -2,6 +2,7 @@
 
 import abc
 import builtins
+import codecs
 import fnmatch
 import io
 import locale
@@ -789,19 +790,31 @@ def open(
         newline: str | None = None,
     ):
         """Open a file in Databricks Workspace. Only text and binary modes are supported."""
-        if encoding is None or encoding == "locale":
-            encoding = locale.getpreferredencoding(False)
         if "b" in mode and "r" in mode:
             return self._ws.workspace.download(self.as_posix(), format=ExportFormat.AUTO)
         if "b" in mode and "w" in mode:
             return _BinaryUploadIO(self._ws, self.as_posix())
         if "r" in mode:
             with self._ws.workspace.download(self.as_posix(), format=ExportFormat.AUTO) as f:
-                return StringIO(f.read().decode(encoding))
+                data = f.read()
+                if encoding is None:
+                    if data.startswith(codecs.BOM_UTF32_LE) or data.startswith(codecs.BOM_UTF32_BE):
+                        encoding = "utf-32"
+                    elif data.startswith(codecs.BOM_UTF16_LE) or data.startswith(codecs.BOM_UTF16_BE):
+                        encoding = "utf-16"
+                    elif data.startswith(codecs.BOM_UTF8):
+                        encoding = "utf-8-sig"
+                if encoding is None or encoding == "locale":
+                    encoding = locale.getpreferredencoding(False)
+                return StringIO(data.decode(encoding))
         if "w" in mode:
             return _TextUploadIO(self._ws, self.as_posix())
         raise ValueError(f"invalid mode: {mode}")
 
+    def read_text(self, encoding=None, errors=None):
+        with self.open(mode="r", encoding=encoding, errors=errors) as f:
+            return f.read()
+
     @property
     def suffix(self) -> str:
         """Return the file extension. If the file is a notebook, return the suffix based on the language."""
diff --git a/tests/integration/test_paths.py b/tests/integration/test_paths.py
index 0b6b2c3..4691f17 100644
--- a/tests/integration/test_paths.py
+++ b/tests/integration/test_paths.py
@@ -1,3 +1,4 @@
+import codecs
 from pathlib import Path
 
 import pytest
@@ -205,3 +206,23 @@ def test_file_and_notebook_in_same_folder_with_different_suffixes(ws, make_noteb
     assert files["a.txt"].suffix == ".txt"
     assert files["b"].suffix == ".py"  # suffix is determined from ObjectInfo
     assert files["b"].read_text() == "# Databricks notebook source\ndisplay(spark.range(10))"
+
+
+@pytest.mark.parametrize(
+    "bom, encoding",
+    [
+        (codecs.BOM_UTF8, "utf-8"),
+        (codecs.BOM_UTF16_LE, "utf-16-le"),
+        (codecs.BOM_UTF16_BE, "utf-16-be"),
+        (codecs.BOM_UTF32_LE, "utf-32-le"),
+        (codecs.BOM_UTF32_BE, "utf-32-be"),
+    ],
+)
+def test_correctly_encodes_and_decodes_file_with_bom(bom, encoding, ws, make_directory):
+    # Can't test notebooks because the server changes the uploaded data
+    folder = WorkspacePath(ws, make_directory())
+    file_path = folder / f"some_file_{encoding}.py"
+    data = bom + "a = 12".encode(encoding)
+    file_path.write_bytes(data)
+    text = file_path.read_text()
+    assert text == "a = 12"
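
For reference, a minimal standalone sketch of the decoding rule this patch
introduces: an explicit encoding wins, otherwise the BOM is sniffed (UTF-32
before UTF-16, because codecs.BOM_UTF16_LE is a prefix of codecs.BOM_UTF32_LE),
and the locale default remains the fallback. The detect_encoding helper below
is illustrative only and not part of the patch; WorkspacePath.open inlines
this logic.

    import codecs
    import locale


    def detect_encoding(data: bytes, encoding: str | None) -> str:
        """Pick an encoding: explicit argument first, then BOM sniffing, then the locale default."""
        if encoding is None:
            # UTF-32 must be tested before UTF-16: BOM_UTF16_LE (b"\xff\xfe") is a
            # prefix of BOM_UTF32_LE (b"\xff\xfe\x00\x00").
            if data.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
                encoding = "utf-32"
            elif data.startswith((codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE)):
                encoding = "utf-16"
            elif data.startswith(codecs.BOM_UTF8):
                encoding = "utf-8-sig"  # decodes UTF-8 and strips the BOM
        if encoding is None or encoding == "locale":
            encoding = locale.getpreferredencoding(False)
        return encoding


    # A UTF-16-LE payload with its BOM is detected and round-trips to clean text.
    payload = codecs.BOM_UTF16_LE + "a = 12".encode("utf-16-le")
    assert detect_encoding(payload, None) == "utf-16"
    assert payload.decode(detect_encoding(payload, None)) == "a = 12"

Using "utf-8-sig" rather than "utf-8" matters here: it strips the BOM during
decoding, so the returned text does not start with a stray U+FEFF character.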