From 778c5d089a8ad66e6bedb52b3ed672051ef065fe Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sun, 10 Dec 2023 05:07:06 -0500 Subject: [PATCH 1/2] fix GPU mapping error for Horovod + finetune When fine-tuning with Horovod, the same error as in https://github.com/deepmodeling/deepmd-kit/issues/2712 is thrown at the line modified in this PR. Signed-off-by: Jinzhe Zeng --- deepmd/utils/batch_size.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/deepmd/utils/batch_size.py b/deepmd/utils/batch_size.py index 2b3117d849..a93f2018f9 100644 --- a/deepmd/utils/batch_size.py +++ b/deepmd/utils/batch_size.py @@ -7,8 +7,12 @@ ) import numpy as np +from packaging.version import ( + Version, +) from deepmd.env import ( + TF_VERSION, tf, ) from deepmd.utils.errors import ( @@ -59,7 +63,7 @@ def __init__(self, initial_batch_size: int = 1024, factor: float = 2.0) -> None: self.minimal_not_working_batch_size = self.maximum_working_batch_size + 1 else: self.maximum_working_batch_size = initial_batch_size - if tf.test.is_gpu_available(): + if (Version(TF_VERSION) >= Version("1.14") and tf.config.experimental.get_visible_devices('GPU')) or tf.test.is_gpu_available(): self.minimal_not_working_batch_size = 2**31 else: self.minimal_not_working_batch_size = ( From 6b0919768fa10d3db3ce39728064b2834914dc8d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 10 Dec 2023 10:08:40 +0000 Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- deepmd/utils/batch_size.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/deepmd/utils/batch_size.py b/deepmd/utils/batch_size.py index a93f2018f9..fe876a65a5 100644 --- a/deepmd/utils/batch_size.py +++ b/deepmd/utils/batch_size.py @@ -63,7 +63,10 @@ def __init__(self, initial_batch_size: int = 1024, factor: float = 2.0) -> None: self.minimal_not_working_batch_size = 
self.maximum_working_batch_size + 1 else: self.maximum_working_batch_size = initial_batch_size - if (Version(TF_VERSION) >= Version("1.14") and tf.config.experimental.get_visible_devices('GPU')) or tf.test.is_gpu_available(): + if ( + Version(TF_VERSION) >= Version("1.14") + and tf.config.experimental.get_visible_devices("GPU") + ) or tf.test.is_gpu_available(): self.minimal_not_working_batch_size = 2**31 else: self.minimal_not_working_batch_size = (