From 53b94634639e673eeac880b005e9e64981259035 Mon Sep 17 00:00:00 2001
From: Eric Vergnaud
Date: Fri, 2 Aug 2024 14:20:26 +0200
Subject: [PATCH] support files with unicode BOM (#138)

Files uploaded using BinaryIO can start with a Unicode BOM. When
downloaded, these files must be decoded by detecting the BOM.

This does not apply to notebooks, because Databricks transforms the
uploaded code before storage (prepending a header) and ignores the
BOM, if any (possibly a Databricks bug). As a result, it is not
possible to store notebooks with a BOM.

---------

Co-authored-by: Eric Vergnaud
---
 src/databricks/labs/blueprint/paths.py | 19 ++++++++++++++++---
 tests/integration/test_paths.py        | 21 +++++++++++++++++++++
 2 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/src/databricks/labs/blueprint/paths.py b/src/databricks/labs/blueprint/paths.py
index 48b4e30..c203db6 100644
--- a/src/databricks/labs/blueprint/paths.py
+++ b/src/databricks/labs/blueprint/paths.py
@@ -2,6 +2,7 @@
 
 import abc
 import builtins
+import codecs
 import fnmatch
 import io
 import locale
@@ -789,19 +790,31 @@ def open(
         newline: str | None = None,
     ):
         """Open a file in Databricks Workspace. Only text and binary modes are supported."""
-        if encoding is None or encoding == "locale":
-            encoding = locale.getpreferredencoding(False)
         if "b" in mode and "r" in mode:
             return self._ws.workspace.download(self.as_posix(), format=ExportFormat.AUTO)
         if "b" in mode and "w" in mode:
             return _BinaryUploadIO(self._ws, self.as_posix())
         if "r" in mode:
             with self._ws.workspace.download(self.as_posix(), format=ExportFormat.AUTO) as f:
-                return StringIO(f.read().decode(encoding))
+                data = f.read()
+                if encoding is None:
+                    if data.startswith(codecs.BOM_UTF32_LE) or data.startswith(codecs.BOM_UTF32_BE):
+                        encoding = "utf-32"
+                    elif data.startswith(codecs.BOM_UTF16_LE) or data.startswith(codecs.BOM_UTF16_BE):
+                        encoding = "utf-16"
+                    elif data.startswith(codecs.BOM_UTF8):
+                        encoding = "utf-8-sig"
+                if encoding is None or encoding == "locale":
+                    encoding = locale.getpreferredencoding(False)
+                return StringIO(data.decode(encoding))
         if "w" in mode:
             return _TextUploadIO(self._ws, self.as_posix())
         raise ValueError(f"invalid mode: {mode}")
 
+    def read_text(self, encoding=None, errors=None):
+        with self.open(mode="r", encoding=encoding, errors=errors) as f:
+            return f.read()
+
     @property
     def suffix(self) -> str:
         """Return the file extension. If the file is a notebook, return the suffix based on the language."""
diff --git a/tests/integration/test_paths.py b/tests/integration/test_paths.py
index 0b6b2c3..4691f17 100644
--- a/tests/integration/test_paths.py
+++ b/tests/integration/test_paths.py
@@ -1,3 +1,4 @@
+import codecs
 from pathlib import Path
 
 import pytest
@@ -205,3 +206,23 @@ def test_file_and_notebook_in_same_folder_with_different_suffixes(ws, make_noteb
     assert files["a.txt"].suffix == ".txt"
     assert files["b"].suffix == ".py"  # suffix is determined from ObjectInfo
     assert files["b"].read_text() == "# Databricks notebook source\ndisplay(spark.range(10))"
+
+
+@pytest.mark.parametrize(
+    "bom, encoding",
+    [
+        (codecs.BOM_UTF8, "utf-8"),
+        (codecs.BOM_UTF16_LE, "utf-16-le"),
+        (codecs.BOM_UTF16_BE, "utf-16-be"),
+        (codecs.BOM_UTF32_LE, "utf-32-le"),
+        (codecs.BOM_UTF32_BE, "utf-32-be"),
+    ],
+)
+def test_correctly_encodes_and_decodes_file_with_bom(bom, encoding, ws, make_directory):
+    # Can't test notebooks because the server changes the uploaded data
+    folder = WorkspacePath(ws, make_directory())
+    file_path = folder / f"some_file_{encoding}.py"
+    data = bom + "a = 12".encode(encoding)
+    file_path.write_bytes(data)
+    text = file_path.read_text()
+    assert text == "a = 12"
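
For reference, a minimal standalone sketch of the decoding rule this patch
introduces: an explicit encoding wins, otherwise the BOM is sniffed (UTF-32
before UTF-16, because codecs.BOM_UTF16_LE is a prefix of codecs.BOM_UTF32_LE),
and the locale default remains the fallback. The detect_encoding helper below
is illustrative only and not part of the patch; WorkspacePath.open inlines
this logic.

    import codecs
    import locale


    def detect_encoding(data: bytes, encoding: str | None) -> str:
        """Pick an encoding: explicit argument first, then BOM sniffing, then the locale default."""
        if encoding is None:
            # UTF-32 must be tested before UTF-16: BOM_UTF16_LE (b"\xff\xfe") is a
            # prefix of BOM_UTF32_LE (b"\xff\xfe\x00\x00").
            if data.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
                encoding = "utf-32"
            elif data.startswith((codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE)):
                encoding = "utf-16"
            elif data.startswith(codecs.BOM_UTF8):
                encoding = "utf-8-sig"  # decodes UTF-8 and strips the BOM
        if encoding is None or encoding == "locale":
            encoding = locale.getpreferredencoding(False)
        return encoding


    # A UTF-16-LE payload with its BOM is detected and round-trips to clean text.
    payload = codecs.BOM_UTF16_LE + "a = 12".encode("utf-16-le")
    assert detect_encoding(payload, None) == "utf-16"
    assert payload.decode(detect_encoding(payload, None)) == "a = 12"

Using "utf-8-sig" rather than "utf-8" matters here: it strips the BOM during
decoding, so the returned text does not start with a stray U+FEFF character.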