diff --git a/run_llama_train.sh b/run_llama_train.sh index 2d154964..13b66aea 100755 --- a/run_llama_train.sh +++ b/run_llama_train.sh @@ -2,6 +2,9 @@ set -ex +# libuv is a scalable backend for TCPStore, which is used in ProcessGroup +# rendezvous. This is the recommended backend for distributed training. +export USE_LIBUV=1 TRAINER_DIR=${1:-/home/$USER/local/torchtrain} # use envs as local overrides for convenience