[BUG] cannot find -lcurand and -lcudart #3929

Closed
GasolSun36 opened this issue Jul 12, 2023 · 15 comments · Fixed by #4235
Labels: bug, training


GasolSun36 commented Jul 12, 2023

Describe the bug
The error is
FAILED: cpu_adam.so c++ cpu_adam.o custom_cuda_kernel.cuda.o -shared -lcurand -L/home/xuchengjin/anaconda3/envs/test/lib/python3.9/site-packages/torch/lib -lc10 -lc10_cuda -ltorch_cpu -ltorch_cuda_cu -ltorch_cuda_cpp -ltorch -ltorch_python -L/home/xuchengjin/anaconda3/envs/test/lib64 -lcudart -o cpu_adam.so /usr/bin/ld: cannot find -lcurand /usr/bin/ld: cannot find -lcudart
When I look in torch/lib, there really is no libcurand.so or libcudart.so in there.
Is this normal? Or is there something wrong with my CUDA installation? Can I copy these two files from someone else who already has them into my directory?
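To check where (or whether) these libraries exist anywhere on the machine, commands like the following can help (just a sketch; the paths are examples and will differ per setup):

# List the CUDA libraries visible to the dynamic linker
ldconfig -p | grep -E 'libcurand|libcudart'
# Search common install locations (adjust the paths to your own environment)
find /usr/local/cuda* ~/anaconda3/envs -name 'libcurand*' -o -name 'libcudart*' 2>/dev/null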

To Reproduce
Steps to reproduce the behavior:
I'm running stanford_alpaca train.py, and using

torchrun --nproc_per_node=4 --master_port=<your_random_port> train.py \
    --model_name_or_path <your_path_to_hf_converted_llama_ckpt_and_tokenizer> \
    --data_path ./alpaca_data.json \
    --bf16 True \
    --output_dir <your_output_dir> \
    --num_train_epochs 3 \
    --per_device_train_batch_size 4 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 8 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 2000 \
    --save_total_limit 1 \
    --learning_rate 2e-5 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --deepspeed "./configs/default_offload_opt_param.json" \
    --tf32 True

to start the training.
The "default_offload_opt_param.json" is:

{
  "bf16": {
    "enabled": "auto"
  },
  "optimizer": {
    "type": "AdamW",
    "params": {
      "lr": "auto",
      "betas": "auto",
      "eps": "auto",
      "weight_decay": "auto"
    }
  },
  "scheduler": {
    "type": "WarmupDecayLR",
    "params": {
      "total_num_steps": "auto",
      "warmup_min_lr": "auto",
      "warmup_max_lr": "auto",
      "warmup_num_steps": "auto"
    }
  },
  "zero_optimization": {
    "stage": 3,
    "offload_optimizer": {
      "device": "cpu",
      "pin_memory": true
    },
    "offload_param": {
      "device": "cpu",
      "pin_memory": true
    },
    "overlap_comm": true,
    "contiguous_gradients": true,
    "sub_group_size": 1e9,
    "reduce_bucket_size": "auto",
    "stage3_prefetch_bucket_size": "auto",
    "stage3_param_persistence_threshold": "auto",
    "stage3_max_live_parameters": 1e9,
    "stage3_max_reuse_distance": 1e9,
    "stage3_gather_16bit_weights_on_model_save": false
  },
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "steps_per_print": 5,
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "wall_clock_breakdown": false
}

Expected behavior

[1/3] /home/xuchengjin/anaconda3/envs/test/bin/nvcc -DTORCH_EXTENSION_NAME=cpu_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE="gcc" -DPYBIND11_STDLIB="libstdcpp" -DPYBIND11_BUILD_ABI="cxxabi1011" -I/home/xuchengjin/anaconda3/envs/test/lib/python3.9/site-packages/deepspeed/ops/csrc/includes -I/home/xuchengjin/anaconda3/envs/test/include -isystem /home/xuchengjin/anaconda3/envs/test/lib/python3.9/site-packages/torch/include -isystem /home/xuchengjin/anaconda3/envs/test/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -isystem /home/xuchengjin/anaconda3/envs/test/lib/python3.9/site-packages/torch/include/TH -isystem /home/xuchengjin/anaconda3/envs/test/lib/python3.9/site-packages/torch/include/THC -isystem /home/xuchengjin/anaconda3/envs/test/include -isystem /home/xuchengjin/anaconda3/envs/test/include/python3.9 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS -D__CUDA_NO_HALF_CONVERSIONS_ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options '-fPIC' -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -c /home/xuchengjin/anaconda3/envs/test/lib/python3.9/site-packages/deepspeed/ops/csrc/common/custom_cuda_kernel.cu -o custom_cuda_kernel.cuda.o
[2/3] c++ -MMD -MF cpu_adam.o.d -DTORCH_EXTENSION_NAME=cpu_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE="gcc" -DPYBIND11_STDLIB="libstdcpp" -DPYBIND11_BUILD_ABI="cxxabi1011" -I/home/xuchengjin/anaconda3/envs/test/lib/python3.9/site-packages/deepspeed/ops/csrc/includes -I/home/xuchengjin/anaconda3/envs/test/include -isystem /home/xuchengjin/anaconda3/envs/test/lib/python3.9/site-packages/torch/include -isystem /home/xuchengjin/anaconda3/envs/test/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -isystem /home/xuchengjin/anaconda3/envs/test/lib/python3.9/site-packages/torch/include/TH -isystem /home/xuchengjin/anaconda3/envs/test/lib/python3.9/site-packages/torch/include/THC -isystem /home/xuchengjin/anaconda3/envs/test/include -isystem /home/xuchengjin/anaconda3/envs/test/include/python3.9 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++14 -O3 -std=c++17 -g -Wno-reorder -L/home/xuchengjin/anaconda3/envs/test/lib64 -lcudart -lcublas -g -march=native -fopenmp -D__AVX512 -D__ENABLE_CUDA_ -DBF16_AVAILABLE -c /home/xuchengjin/anaconda3/envs/test/lib/python3.9/site-packages/deepspeed/ops/csrc/adam/cpu_adam.cpp -o cpu_adam.o
[3/3] c++ cpu_adam.o custom_cuda_kernel.cuda.o -shared -lcurand -L/home/xuchengjin/anaconda3/envs/test/lib/python3.9/site-packages/torch/lib -lc10 -lc10_cuda -ltorch_cpu -ltorch_cuda_cu -ltorch_cuda_cpp -ltorch -ltorch_python -L/home/xuchengjin/anaconda3/envs/test/lib64 -lcudart -o cpu_adam.so
FAILED: cpu_adam.so
c++ cpu_adam.o custom_cuda_kernel.cuda.o -shared -lcurand -L/home/xuchengjin/anaconda3/envs/test/lib/python3.9/site-packages/torch/lib -lc10 -lc10_cuda -ltorch_cpu -ltorch_cuda_cu -ltorch_cuda_cpp -ltorch -ltorch_python -L/home/xuchengjin/anaconda3/envs/test/lib64 -lcudart -o cpu_adam.so
/usr/bin/ld: cannot find -lcurand
/usr/bin/ld: cannot find -lcudart
collect2: error: ld returned 1 exit status
ninja: build stopped: subcommand failed.
Traceback (most recent call last):
File "/home/xuchengjin/anaconda3/envs/test/lib/python3.9/site-packages/torch/utils/cpp_extension.py", line 1900, in _run_ninja_build
subprocess.run(
File "/home/xuchengjin/anaconda3/envs/test/lib/python3.9/subprocess.py", line 528, in run
raise CalledProcessError(retcode, process.args,
subprocess.CalledProcessError: Command '['ninja', '-v']' returned non-zero exit status 1.

ds_report output

[2023-07-12 10:49:56,649] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)

DeepSpeed C++/CUDA extension op report

NOTE: Ops not installed will be just-in-time (JIT) compiled at
runtime if needed. Op compatibility means that your system
meet the required dependencies to JIT install the op.

JIT compiled ops requires ninja
ninja .................. [OKAY]

op name ................ installed .. compatible

async_io ............... [NO] ....... [OKAY]
cpu_adagrad ............ [NO] ....... [OKAY]
cpu_adam ............... [NO] ....... [OKAY]
fused_adam ............. [NO] ....... [OKAY]
fused_lamb ............. [NO] ....... [OKAY]
quantizer .............. [NO] ....... [OKAY]
random_ltd ............. [NO] ....... [OKAY]
[WARNING] using untested triton version (2.0.0), only 1.0.0 is known to be compatible
sparse_attn ............ [NO] ....... [NO]
spatial_inference ...... [NO] ....... [OKAY]
transformer ............ [NO] ....... [OKAY]
stochastic_transformer . [NO] ....... [OKAY]
transformer_inference .. [NO] ....... [OKAY]

DeepSpeed general environment info:
torch install path ............... ['/home/xuchengjin/anaconda3/envs/test/lib/python3.9/site-packages/torch']
torch version .................... 1.13.1
deepspeed install path ........... ['/home/xuchengjin/anaconda3/envs/test/lib/python3.9/site-packages/deepspeed']
deepspeed info ................... 0.10.0+55243f3b, 55243f3, master
torch cuda version ............... 11.7
torch hip version ................ None
nvcc version ..................... 11.7
deepspeed wheel compiled w. ...... torch 1.13, cuda 11.7

System info (please complete the following information):

  • OS: [e.g. Ubuntu 20.04]
  • single machine with x8 A100-40G
  • Python-3.9.17
  • deepspeed-0.10.0
  • accelerate 0.20.3
  • ninja 1.11.1
  • cudatoolkit 11.7

Launcher context
Are you launching your experiment with the deepspeed launcher, MPI, or something else?

Additional context
Add any other context about the problem here.

@GasolSun36 GasolSun36 added bug Something isn't working training labels Jul 12, 2023
loadams (Contributor) commented Jul 12, 2023

Hi @GasolSun36 - if you don't see either of the cuda.so files in your cuda install path, my guess is that something went wrong on the cuda install. Unfortunately, there is more to these than just the .so files, so you probably want to re-install cuda and ensure it installs correctly with both curand and cudart.

Can you try re-installing cuda and let us know if that fixes your issue?
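One minimal way to sanity-check the CUDA toolkit afterwards (a sketch that assumes the default /usr/local/cuda install prefix; adjust if your toolkit lives elsewhere):

# Confirm the compiler is on PATH and the toolkit ships both libraries
nvcc --version
ls /usr/local/cuda/lib64/libcurand* /usr/local/cuda/lib64/libcudart*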

@loadams loadams self-assigned this Jul 12, 2023
loadams (Contributor) commented Jul 17, 2023

Hi @GasolSun36 - were you able to test the re-install of cuda?

syuoni commented Jul 20, 2023

Hi @loadams ,

I have the same issue. When I run

python -c 'import deepspeed; deepspeed.ops.adam.cpu_adam.CPUAdamBuilder().load()'

I get

[2023-07-20 08:43:48,536] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
Using /home/enwei/.cache/torch_extensions/py39_cu117 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/enwei/.cache/torch_extensions/py39_cu117/cpu_adam/build.ninja...
Building extension module cpu_adam...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
[1/1] c++ cpu_adam.o custom_cuda_kernel.cuda.o -shared -lcurand -L/home/enwei/anaconda3/envs/llama/lib/python3.9/site-packages/torch/lib -lc10 -lc10_cuda -ltorch_cpu -ltorch_cuda -ltorch -ltorch_python -L/home/enwei/anaconda3/envs/llama/lib64 -lcudart -o cpu_adam.so
FAILED: cpu_adam.so
c++ cpu_adam.o custom_cuda_kernel.cuda.o -shared -lcurand -L/home/enwei/anaconda3/envs/llama/lib/python3.9/site-packages/torch/lib -lc10 -lc10_cuda -ltorch_cpu -ltorch_cuda -ltorch -ltorch_python -L/home/enwei/anaconda3/envs/llama/lib64 -lcudart -o cpu_adam.so
/usr/bin/ld: cannot find -lcurand: No such file or directory
/usr/bin/ld: cannot find -lcudart: No such file or directory
collect2: error: ld returned 1 exit status
ninja: build stopped: subcommand failed.
Traceback (most recent call last):
  File "/home/enwei/anaconda3/envs/llama/lib/python3.9/site-packages/torch/utils/cpp_extension.py", line 1893, in _run_ninja_build
    subprocess.run(
  File "/home/enwei/anaconda3/envs/llama/lib/python3.9/subprocess.py", line 528, in run
    raise CalledProcessError(retcode, process.args,
subprocess.CalledProcessError: Command '['ninja', '-v']' returned non-zero exit status 1.

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/home/enwei/anaconda3/envs/llama/lib/python3.9/site-packages/deepspeed/ops/op_builder/builder.py", line 454, in load
    return self.jit_load(verbose)
  File "/home/enwei/anaconda3/envs/llama/lib/python3.9/site-packages/deepspeed/ops/op_builder/builder.py", line 497, in jit_load
    op_module = load(name=self.name,
  File "/home/enwei/anaconda3/envs/llama/lib/python3.9/site-packages/torch/utils/cpp_extension.py", line 1284, in load
    return _jit_compile(
  File "/home/enwei/anaconda3/envs/llama/lib/python3.9/site-packages/torch/utils/cpp_extension.py", line 1509, in _jit_compile
    _write_ninja_file_and_build_library(
  File "/home/enwei/anaconda3/envs/llama/lib/python3.9/site-packages/torch/utils/cpp_extension.py", line 1624, in _write_ninja_file_and_build_library
    _run_ninja_build(
  File "/home/enwei/anaconda3/envs/llama/lib/python3.9/site-packages/torch/utils/cpp_extension.py", line 1909, in _run_ninja_build
    raise RuntimeError(message) from e
RuntimeError: Error building extension 'cpu_adam'

I checked the path /home/enwei/anaconda3/envs/llama/lib/python3.9/site-packages/torch/lib, and there is no libcurand.so or libcudart.so there. These files are found in /home/enwei/anaconda3/envs/llama/lib.

I am not sure what the problem is with my CUDA installation. Can you help me? Thanks.

Best

syuoni commented Jul 20, 2023

I find that there is no lib64 under /home/enwei/anaconda3/envs/llama. So I copied everything in lib to lib64, and the problem is solved for me.
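In commands, the workaround looks roughly like this (a sketch; the env prefix is just my path, and a directory symlink instead of a copy should work as well):

# Mirror the conda env's lib directory into lib64 so the linker's -L<env>/lib64 succeeds
ENV_PREFIX="$HOME/anaconda3/envs/llama"
mkdir -p "$ENV_PREFIX/lib64"
cp -a "$ENV_PREFIX/lib/." "$ENV_PREFIX/lib64/"
# Alternatively, skip the copy and symlink the whole directory:
#   ln -s "$ENV_PREFIX/lib" "$ENV_PREFIX/lib64"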

GasolSun36 (Author) commented:

I find that there is no lib64 under /home/enwei/anaconda3/envs/llama. So I copied everything in lib to lib64, and the problem is solved for me.

Thanks a lot! That sounds like a solution. However, I created a new environment and found that the cause was that I had installed PyTorch 1.13.1 with conda; after installing it with pip instead, the issue was resolved.
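For anyone trying the same fix, one way to do the pip install for the versions above (a sketch; assumes the CUDA 11.7 wheels match your setup):

# Replace the conda-installed torch with the pip cu117 wheel
pip uninstall -y torch
pip install torch==1.13.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117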

maximegmd (Contributor) commented:

Could we re-open this? I have the same issue on SageMaker; copying the DLLs doesn't seem like a fix but a workaround. Couldn't we include the lib folder as well in the linker phase?

maximegmd added a commit to maximegmd/DeepSpeed that referenced this issue Aug 30, 2023
GasolSun36 (Author) commented:

Could we re-open this? I have the same issue on SageMaker; copying the DLLs doesn't seem like a fix but a workaround. Couldn't we include the lib folder as well in the linker phase?

Maybe installing PyTorch with pip could solve your problem; it worked for me.

maximegmd (Contributor) commented Aug 30, 2023

It's not related to pip; it's just that conda installs into the lib folder and not lib64 on SageMaker. I made a PR to fix this issue.

Temporary solution for anyone reading this: set the environment variables

export LIBRARY_PATH=/opt/conda/lib/
export LD_LIBRARY_PATH=/opt/conda/lib/
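For a one-off run you can also set them inline, e.g. when re-triggering the JIT build (same /opt/conda prefix assumed):

LIBRARY_PATH=/opt/conda/lib/ LD_LIBRARY_PATH=/opt/conda/lib/ \
  python -c 'import deepspeed; deepspeed.ops.adam.cpu_adam.CPUAdamBuilder().load()'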

@loadams loadams linked a pull request Aug 30, 2023 that will close this issue
loadams (Contributor) commented Aug 30, 2023

Hi @maximegmd - it would be interesting to know why SageMaker/the OS uses the lib folder and not lib64, but thanks for the PR; we will get that merged, so there should be no need to re-open this issue, right?

github-merge-queue bot pushed a commit that referenced this issue Aug 30, 2023
* fix: linker issues in conda environments #3929

* ignore: re-ordering

* Update builder.py
pacman100 (Contributor) commented:

Could we re-open this? I have the same issue on SageMaker; copying the DLLs doesn't seem like a fix but a workaround. Couldn't we include the lib folder as well in the linker phase?

Maybe installing PyTorch with pip could solve your problem; it worked for me.

Thank you @GasolSun36, this finally solved the issue!

loadams (Contributor) commented Dec 14, 2023

Hey @pacman100 - do you need any other support on this issue or are things working now?

AceMcAwesome77 commented Jun 14, 2024

I find that there is no lib64 under /home/enwei/anaconda3/envs/llama. So I copied everything in lib to lib64, and the problem is solved for me.

This worked for me too. I actually just made a symbolic link for that libcurand file, same idea. Like this:

cd /home/asdf/.local/lib/python3.10/site-packages/torch/lib
ln -s /usr/local/cuda/lib64/libcurand.so .

isruihu commented Jun 19, 2024

I find that there is no lib64 under /home/enwei/anaconda3/envs/llama. So I copied everything in lib to lib64, and the problem is solved for me.

This worked for me too. I actually just made a symbolic link for that libcurand file, same idea. Like this:

cd /home/asdf/.local/lib/python3.10/site-packages/torch/lib
ln -s /usr/local/cuda/lib64/libcurand.so .

It worked for me, thanks.

AaronZLT commented Jul 3, 2024

But why does it use lib instead of lib64?

ParthaEth commented:

Look at the command that was run right before the error occurred. I am running DeepSpeed using the Hugging Face Accelerate framework; for me it looks like the following:

c++ cpu_adam.o cpu_adam_impl.o -shared -lcurand -L/home/pghosh/miniconda3/envs/alpha_geo/lib/python3.11/site-packages/torch/lib -lc10 -ltorch_cpu -ltorch -ltorch_python -o cpu_adam.so
/usr/bin/ld: cannot find -lcurand: No such file or directory

So clearly the linking directory is
/home/pghosh/miniconda3/envs/alpha_geo/lib/python3.11/site-packages/torch/lib

Therefore, if you symlink the library into this directory, the error should go away. So I found it in my CUDA installation and then, from inside that directory, ran
ln -s /usr/local/cuda/lib64/libcurand.so .
libcurand.so seems to be the correct library!
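Putting that together, a sketch of the full workaround (the torch/lib path is the one from the link command above; the CUDA path assumes a default /usr/local/cuda install, and the libcudart line is only needed if the linker also reports that library missing):

# Symlink the missing CUDA libraries into the directory the linker actually searches
TORCH_LIB=/home/pghosh/miniconda3/envs/alpha_geo/lib/python3.11/site-packages/torch/lib
ln -s /usr/local/cuda/lib64/libcurand.so "$TORCH_LIB/"
ln -s /usr/local/cuda/lib64/libcudart.so "$TORCH_LIB/"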
