From e3281e78a923af29e44a3c787d5bd6112c73e56e Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Mon, 13 Jan 2025 16:20:15 +0100 Subject: [PATCH 1/6] Change `du` command to use --block-size instead of --bytes --- src/aiida/orm/nodes/data/remote/base.py | 2 +- tests/orm/nodes/data/test_remote.py | 29 ++++++++++++------------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/src/aiida/orm/nodes/data/remote/base.py b/src/aiida/orm/nodes/data/remote/base.py index 5b7fec86c2..d74a711e71 100644 --- a/src/aiida/orm/nodes/data/remote/base.py +++ b/src/aiida/orm/nodes/data/remote/base.py @@ -285,7 +285,7 @@ def _get_size_on_disk_du(self, full_path: Path, transport: Transport) -> int: """ try: - retval, stdout, stderr = transport.exec_command_wait(f'du -s --bytes {full_path}') + retval, stdout, stderr = transport.exec_command_wait(f'du -s --block-size=1 {full_path}') except NotImplementedError as exc: raise NotImplementedError('`exec_command_wait` not implemented for the current transport plugin.') from exc diff --git a/tests/orm/nodes/data/test_remote.py b/tests/orm/nodes/data/test_remote.py index ad12d97273..52848be019 100644 --- a/tests/orm/nodes/data/test_remote.py +++ b/tests/orm/nodes/data/test_remote.py @@ -55,8 +55,8 @@ def test_clean(remote_data_factory, mode): @pytest.mark.parametrize( 'setup, results', ( - (('du', False), ('4.01 KB', 'du')), - (('du', True), (4108, 'du')), + (('du', False), ('8.00 KB', 'du')), + (('du', True), (8192, 'du')), (('stat', False), ('12.00 B', 'stat')), (('stat', True), (12, 'stat')), ), @@ -74,13 +74,12 @@ def test_get_size_on_disk_params(remote_data_factory, mode, setup, results): @pytest.mark.parametrize( 'content, sizes', ( - (b'a', {'du': 4097, 'stat': 1, 'human': '4.00 KB'}), - (10 * b'a', {'du': 4106, 'stat': 10, 'human': '4.01 KB'}), - (100 * b'a', {'du': 4196, 'stat': 100, 'human': '4.10 KB'}), - (1000 * b'a', {'du': 5096, 'stat': 1000, 'human': '4.98 KB'}), - (1000000 * b'a', {'du': 1004096, 'stat': int(1e6), 'human': '980.56 KB'}), + (b'a', {'du': 8192, 'stat': 1, 'human': '8.00 KB'}), + (10 * b'a', {'du': 8192, 'stat': 10, 'human': '8.00 KB'}), + (1000 * b'a', {'du': 8192, 'stat': 1000, 'human': '8.00 KB'}), + (1000000 * b'a', {'du': 1007616, 'stat': int(1e6), 'human': '984.00 KB'}), ), - ids=['1-byte', '10-bytes', '100-bytes', '1000-bytes', '1e6-bytes'], + ids=['1-byte', '10-bytes', '1000-bytes', '1e6-bytes'], ) def test_get_size_on_disk_sizes(remote_data_factory, mode, content, sizes): """Test the different implementations to obtain the size of a ``RemoteData`` on disk.""" @@ -103,12 +102,12 @@ def test_get_size_on_disk_sizes(remote_data_factory, mode, content, sizes): @pytest.mark.parametrize( 'num_char, relpath, sizes', ( - (1, '.', {'du': 12291, 'stat': 8195, 'human': '12.00 KB'}), - (100, '.', {'du': 12588, 'stat': 8492, 'human': '12.29 KB'}), - (int(1e6), '.', {'du': 3012288, 'stat': 3008192, 'human': '2.87 MB'}), - (1, 'subdir1', {'du': 8194, 'stat': 4098, 'human': '8.00 KB'}), - (100, 'subdir1', {'du': 8392, 'stat': 4296, 'human': '8.20 KB'}), - (int(1e6), 'subdir1', {'du': 2008192, 'stat': 2004096, 'human': '1.92 MB'}), + (1, '.', {'du': 24576, 'stat': 8195, 'human': '24.00 KB'}), + (100, '.', {'du': 24576, 'stat': 8492, 'human': '24.00 KB'}), + (int(1e6), '.', {'du': 3022848, 'stat': 3008192, 'human': '2.88 MB'}), + (1, 'subdir1', {'du': 16384, 'stat': 4098, 'human': '16.00 KB'}), + (100, 'subdir1', {'du': 16384, 'stat': 4296, 'human': '16.00 KB'}), + (int(1e6), 'subdir1', {'du': 2015232, 'stat': 2004096, 'human': '1.92 MB'}), ), ) def test_get_size_on_disk_nested(aiida_localhost, tmp_path, num_char, relpath, sizes): @@ -173,7 +172,7 @@ def test_get_size_on_disk_du(remote_data_factory, mode, monkeypatch): with authinfo.get_transport() as transport: size_on_disk = remote_data._get_size_on_disk_du(transport=transport, full_path=full_path) - assert size_on_disk == 4108 + assert size_on_disk == 8192 # Monkeypatch transport exec_command_wait command to simulate it not being implemented, e.g., for FirecREST plugin def mock_exec_command_wait(command): From fcd9f73143357e8fd64a0a115e3bfed61df2ac0f Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Mon, 13 Jan 2025 16:52:13 +0100 Subject: [PATCH 2/6] Adapt runners to Ubuntu-24.04 check if `du` fixed. --- .github/workflows/ci-code.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci-code.yml b/.github/workflows/ci-code.yml index 7ff4a9d864..2281c39f91 100644 --- a/.github/workflows/ci-code.yml +++ b/.github/workflows/ci-code.yml @@ -20,7 +20,7 @@ jobs: tests: - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 timeout-minutes: 45 strategy: @@ -93,7 +93,7 @@ jobs: tests-presto: - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 timeout-minutes: 20 steps: @@ -118,7 +118,7 @@ jobs: verdi: - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 timeout-minutes: 10 steps: From 204d7c3381061c52212cbf7cabaf81205e9a0e18 Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Mon, 13 Jan 2025 18:16:40 +0100 Subject: [PATCH 3/6] Add notes to `du` command and revert ci-code --- .github/workflows/ci-code.yml | 6 +++--- src/aiida/orm/nodes/data/remote/base.py | 6 ++++++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci-code.yml b/.github/workflows/ci-code.yml index 2281c39f91..7ff4a9d864 100644 --- a/.github/workflows/ci-code.yml +++ b/.github/workflows/ci-code.yml @@ -20,7 +20,7 @@ jobs: tests: - runs-on: ubuntu-24.04 + runs-on: ubuntu-latest timeout-minutes: 45 strategy: @@ -93,7 +93,7 @@ jobs: tests-presto: - runs-on: ubuntu-24.04 + runs-on: ubuntu-latest timeout-minutes: 20 steps: @@ -118,7 +118,7 @@ jobs: verdi: - runs-on: ubuntu-24.04 + runs-on: ubuntu-latest timeout-minutes: 10 steps: diff --git a/src/aiida/orm/nodes/data/remote/base.py b/src/aiida/orm/nodes/data/remote/base.py index d74a711e71..f043343126 100644 --- a/src/aiida/orm/nodes/data/remote/base.py +++ b/src/aiida/orm/nodes/data/remote/base.py @@ -285,6 +285,12 @@ def _get_size_on_disk_du(self, full_path: Path, transport: Transport) -> int: """ try: + # Initially, we were using the `--bytes` option here. However, this is equivalent to `--apparent-size + # --block-size=1`, with the `apparent-size` option being rather fragile (e.g. its implementation changed + # between Ubuntu 22.04 and 24.04, causing the tests to fail). Thus, we retain only `--block-size=1` to + # obtain the total size in bytes. Still, block sizes might be different for different disk formattings, OSs, + # etc. However, as 4092 bytes is the default for Linux and macOS, it should be fine to expect it in the + # corresponding tests. Lastly, the `-s` (`--summarize`) option yields only a single total file size value. retval, stdout, stderr = transport.exec_command_wait(f'du -s --block-size=1 {full_path}') except NotImplementedError as exc: raise NotImplementedError('`exec_command_wait` not implemented for the current transport plugin.') from exc From d5ae096babc20bc697977ec65d8644938922883c Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Mon, 13 Jan 2025 18:20:26 +0100 Subject: [PATCH 4/6] Fix typo in comment --- src/aiida/orm/nodes/data/remote/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aiida/orm/nodes/data/remote/base.py b/src/aiida/orm/nodes/data/remote/base.py index f043343126..fce1f7e302 100644 --- a/src/aiida/orm/nodes/data/remote/base.py +++ b/src/aiida/orm/nodes/data/remote/base.py @@ -289,7 +289,7 @@ def _get_size_on_disk_du(self, full_path: Path, transport: Transport) -> int: # --block-size=1`, with the `apparent-size` option being rather fragile (e.g. its implementation changed # between Ubuntu 22.04 and 24.04, causing the tests to fail). Thus, we retain only `--block-size=1` to # obtain the total size in bytes. Still, block sizes might be different for different disk formattings, OSs, - # etc. However, as 4092 bytes is the default for Linux and macOS, it should be fine to expect it in the + # etc. However, as 4096 bytes is the default for Linux and macOS, it should be fine to expect it in the # corresponding tests. Lastly, the `-s` (`--summarize`) option yields only a single total file size value. retval, stdout, stderr = transport.exec_command_wait(f'du -s --block-size=1 {full_path}') except NotImplementedError as exc: From 17241285dfbd5d768ccabf18e47acf5517ff9b59 Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Mon, 13 Jan 2025 19:04:44 +0100 Subject: [PATCH 5/6] Extend comments on `du` usage --- src/aiida/orm/nodes/data/remote/base.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/aiida/orm/nodes/data/remote/base.py b/src/aiida/orm/nodes/data/remote/base.py index fce1f7e302..95cd1b31b3 100644 --- a/src/aiida/orm/nodes/data/remote/base.py +++ b/src/aiida/orm/nodes/data/remote/base.py @@ -286,11 +286,12 @@ def _get_size_on_disk_du(self, full_path: Path, transport: Transport) -> int: try: # Initially, we were using the `--bytes` option here. However, this is equivalent to `--apparent-size - # --block-size=1`, with the `apparent-size` option being rather fragile (e.g. its implementation changed - # between Ubuntu 22.04 and 24.04, causing the tests to fail). Thus, we retain only `--block-size=1` to - # obtain the total size in bytes. Still, block sizes might be different for different disk formattings, OSs, - # etc. However, as 4096 bytes is the default for Linux and macOS, it should be fine to expect it in the - # corresponding tests. Lastly, the `-s` (`--summarize`) option yields only a single total file size value. + # --block-size=1`, with the `--apparent-size` option being rather fragile (e.g. its implementation changed + # between the different version of `du` of Ubuntu 22.04 and 24.04, causing the tests to fail). Thus, we + # retain only `--block-size=1` to obtain the total size in bytes. Still, block sizes might be different for + # different disk formattings, OSs, etc. However, as 4096 bytes is the default for Linux and macOS, it should + # be fine to expect it in the corresponding tests. Lastly, the `-s` (`--summarize`) option yields only a + # single total file size value. retval, stdout, stderr = transport.exec_command_wait(f'du -s --block-size=1 {full_path}') except NotImplementedError as exc: raise NotImplementedError('`exec_command_wait` not implemented for the current transport plugin.') from exc From cf9b884adfe5176ebb4f912825a16130e9396e5b Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Mon, 13 Jan 2025 21:32:41 +0100 Subject: [PATCH 6/6] Correct notes on usage of `du`. --- src/aiida/orm/nodes/data/remote/base.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/src/aiida/orm/nodes/data/remote/base.py b/src/aiida/orm/nodes/data/remote/base.py index 95cd1b31b3..97f5f067b8 100644 --- a/src/aiida/orm/nodes/data/remote/base.py +++ b/src/aiida/orm/nodes/data/remote/base.py @@ -285,13 +285,20 @@ def _get_size_on_disk_du(self, full_path: Path, transport: Transport) -> int: """ try: - # Initially, we were using the `--bytes` option here. However, this is equivalent to `--apparent-size - # --block-size=1`, with the `--apparent-size` option being rather fragile (e.g. its implementation changed - # between the different version of `du` of Ubuntu 22.04 and 24.04, causing the tests to fail). Thus, we - # retain only `--block-size=1` to obtain the total size in bytes. Still, block sizes might be different for - # different disk formattings, OSs, etc. However, as 4096 bytes is the default for Linux and macOS, it should - # be fine to expect it in the corresponding tests. Lastly, the `-s` (`--summarize`) option yields only a - # single total file size value. + # Initially, we were using the `--bytes` option here. However, this is equivalent to `--apparent-size` and + # `--block-size=1`, with the `--apparent-size` option being rather fragile (e.g., its implementation changed + # between the different versions of `du` of Ubuntu 22.04 and 24.04, causing the tests to fail). Thus, we + # retain only `--block-size=1` to obtain the total size in **bytes** as an integer value. Note that + # `--block-size` here refers to the size of "blocks" for displaying the output of `du`, by default 1024 + # bytes (1 KiB), **not** the "disk block size". + + # On the other hand, for the parametrized tests of this functionality in `test_remote.py`, a **disk block + # size** of 4096 bytes (4 KiB) is assumed, meaning that a file with, e.g., only 1 or 10 bytes of actual + # content (its `apparent-size`) still occupies 4096 bytes (4 KiB) on disk. In principle, the disk block size + # can differ between different disk formattings, OS, etc., however, as 4096 bytes (4 KiB) is the default for + # Linux's ext4, as well as macOS, we assume for now that it is a reasonable assumption and default value. + + # Lastly, the `-s` (`--summarize`) option yields only a single total file size value. retval, stdout, stderr = transport.exec_command_wait(f'du -s --block-size=1 {full_path}') except NotImplementedError as exc: raise NotImplementedError('`exec_command_wait` not implemented for the current transport plugin.') from exc