Skip to content

Commit

Permalink
Lock cache file of HF model list (#6628)
Browse files Browse the repository at this point in the history
The error in the following log suggests that the cache file for the HF model
list can become corrupted:

https://github.com/microsoft/DeepSpeed/actions/runs/11343665365/job/31546708118?pr=6614

The actual cause of the above error is unclear, but `_hf_model_list`
can potentially corrupt the cache file when it is called concurrently from
multiple processes. This PR locks the cache file so that
`_hf_model_list` reads and writes it safely.
  • Loading branch information
tohtana authored Oct 15, 2024
1 parent ce468c3 commit 1a45bd8
Showing 1 changed file with 9 additions and 1 deletion.
10 changes: 9 additions & 1 deletion tests/unit/inference/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import os
import time
import requests
import fcntl

from dataclasses import dataclass
from typing import List
Expand Down Expand Up @@ -95,9 +96,12 @@ def _hf_model_list() -> List[ModelInfo]:
if os.path.isfile(cache_file_path):
with open(cache_file_path, 'rb') as f:
try:
fcntl.flock(f, fcntl.LOCK_SH)
model_data = pickle.load(f)
except Exception as e:
print(f"Error loading cache file {cache_file_path}: {e}")
finally:
fcntl.flock(f, fcntl.LOCK_UN)

current_time = time.time()

Expand Down Expand Up @@ -125,7 +129,11 @@ def _hf_model_list() -> List[ModelInfo]:
# Save the updated cache
os.makedirs(cache_dir, exist_ok=True)
with open(cache_file_path, 'wb') as f:
pickle.dump(model_data, f)
try:
fcntl.flock(f, fcntl.LOCK_EX)
pickle.dump(model_data, f)
finally:
fcntl.flock(f, fcntl.LOCK_UN)

return model_data["model_list"]

Expand Down

0 comments on commit 1a45bd8

Please sign in to comment.