From 1a45bd8e8ca27ce32a7091e64d07a04b2adb2bb5 Mon Sep 17 00:00:00 2001 From: Masahiro Tanaka <81312776+tohtana@users.noreply.github.com> Date: Tue, 15 Oct 2024 14:49:37 -0700 Subject: [PATCH] Lock cache file of HF model list (#6628) The error in the following log suggests that the cache file for the HF model list can become corrupted: https://github.com/microsoft/DeepSpeed/actions/runs/11343665365/job/31546708118?pr=6614 The actual cause of the above error is unclear, but `_hf_model_list` can potentially corrupt the cache file when it is called concurrently from multiple processes. This PR locks the cache file to ensure that `_hf_model_list` reads and writes the file safely. --- tests/unit/inference/test_inference.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/unit/inference/test_inference.py b/tests/unit/inference/test_inference.py index 581a2ce433ed..9b563523dbeb 100644 --- a/tests/unit/inference/test_inference.py +++ b/tests/unit/inference/test_inference.py @@ -10,6 +10,7 @@ import os import time import requests +import fcntl from dataclasses import dataclass from typing import List @@ -95,9 +96,12 @@ def _hf_model_list() -> List[ModelInfo]: if os.path.isfile(cache_file_path): with open(cache_file_path, 'rb') as f: try: + fcntl.flock(f, fcntl.LOCK_SH) model_data = pickle.load(f) except Exception as e: print(f"Error loading cache file {cache_file_path}: {e}") + finally: + fcntl.flock(f, fcntl.LOCK_UN) current_time = time.time() @@ -125,7 +129,11 @@ def _hf_model_list() -> List[ModelInfo]: # Save the updated cache os.makedirs(cache_dir, exist_ok=True) with open(cache_file_path, 'wb') as f: - pickle.dump(model_data, f) + try: + fcntl.flock(f, fcntl.LOCK_EX) + pickle.dump(model_data, f) + finally: + fcntl.flock(f, fcntl.LOCK_UN) return model_data["model_list"]