diff --git a/.compatibility b/.compatibility index 4f808740bc02..62d19faffa9e 100644 --- a/.compatibility +++ b/.compatibility @@ -1,3 +1,4 @@ 2.1.0-12.1.0 2.2.2-12.1.0 2.3.0-12.1.0 +2.4.0-12.4.1 diff --git a/.cuda_ext.json b/.cuda_ext.json index 8c9d5916ccd8..1e617755b01b 100644 --- a/.cuda_ext.json +++ b/.cuda_ext.json @@ -5,8 +5,8 @@ "cuda_image": "hpcaitech/cuda-conda:12.1" }, { - "torch_command": "pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu118", - "cuda_image": "hpcaitech/cuda-conda:11.8" + "torch_command": "pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu124", + "cuda_image": "hpcaitech/cuda-conda:12.4" } ] } diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml index 151454239afe..58cd8826809a 100644 --- a/.github/workflows/build_on_pr.yml +++ b/.github/workflows/build_on_pr.yml @@ -141,7 +141,7 @@ jobs: - name: Install Colossal-AI run: | BUILD_EXT=1 pip install -v -e . - pip install -r requirements/requirements-test.txt + pip install --no-cache-dir -r requirements/requirements-test.txt - name: Store Colossal-AI Cache run: | diff --git a/.github/workflows/build_on_schedule.yml b/.github/workflows/build_on_schedule.yml index fc6424503fbc..fc688a71bd92 100644 --- a/.github/workflows/build_on_schedule.yml +++ b/.github/workflows/build_on_schedule.yml @@ -57,7 +57,7 @@ jobs: [ ! -z "$(ls -A /github/home/cuda_ext_cache/)" ] && cp -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/ BUILD_EXT=1 pip install -v -e . cp -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/ - pip install -r requirements/requirements-test.txt + pip install --no-cache-dir -r requirements/requirements-test.txt - name: Unit Testing if: steps.check-avai.outputs.avai == 'true' diff --git a/colossalai/testing/utils.py b/colossalai/testing/utils.py index 5f6864ff0059..90d35dc851bd 100644 --- a/colossalai/testing/utils.py +++ b/colossalai/testing/utils.py @@ -176,7 +176,7 @@ def test_something(): else: exception = Exception - func_wrapper = rerun_on_exception(exception_type=exception, pattern=".*Address already in use.*") + func_wrapper = rerun_on_exception(exception_type=exception, pattern=".*(A|a)ddress already in use.*") return func_wrapper diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 651eb66e89ab..578122d47072 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -8,7 +8,7 @@ click fabric contexttimer ninja -torch>=2.1.0,<=2.3.0 +torch>=2.1.0,<=2.4.0 safetensors einops pydantic diff --git a/tests/test_booster/test_plugin/test_torch_ddp_plugin.py b/tests/test_booster/test_plugin/test_torch_ddp_plugin.py index f92b5c6e5675..2a3b6e5a3a29 100644 --- a/tests/test_booster/test_plugin/test_torch_ddp_plugin.py +++ b/tests/test_booster/test_plugin/test_torch_ddp_plugin.py @@ -47,7 +47,7 @@ def check_torch_ddp_plugin(): registry = model_zoo for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in registry.items(): - if name == "dlrm_interactionarch" or name.startswith("simple_"): + if name in ("dlrm_interactionarch", "transformers_mixtral") or name.startswith("simple_"): continue run_fn(model_fn, data_gen_fn, output_transform_fn) torch.cuda.empty_cache() diff --git a/tests/test_lazy/test_models.py b/tests/test_lazy/test_models.py index c85860a8d253..0a919955f26a 100644 --- a/tests/test_lazy/test_models.py +++ b/tests/test_lazy/test_models.py @@ -18,9 +18,17 @@ def test_models_lazy_init(subset, default_device): sub_model_zoo = model_zoo.get_sub_registry(subset, allow_empty=True) for name, entry in sub_model_zoo.items(): # TODO(ver217): lazy init does not support weight norm, skip these models - if name in ("torchaudio_wav2vec2_base", "torchaudio_hubert_base") or name.startswith( - ("transformers_vit", "transformers_blip2", "transformers_whisper") - ): + if name in ( + "torchaudio_wav2vec2_base", + "torchaudio_hubert_base", + "timm_beit", + "timm_vision_transformer", + "timm_deit", + "timm_beitv2", + "timm_deit3", + "timm_convit", + "timm_tnt_b_patch16_224", + ) or name.startswith(("transformers_vit", "transformers_blip2", "transformers_whisper")): continue check_lazy_init(entry, verbose=True, default_device=default_device)