# Prepares the wiki dataset for training with NeMo. Downloads the data, runs
# preprocessing and saves the data in mmap format on a cloud bucket. This same
# bucket can then be used for training.
#
# This YAML is for demonstration purposes and is not a necessary step before
# running nemo_gpt_train.yaml. Since this preprocessing can take
# up to 6 hours, we provide a read-only bucket with the preprocessed data (gs://sky-wiki-data)
# that can be downloaded to your bucket (see nemo_gpt_train.yaml).
#
# Usage:
# sky launch -s -c nemo_gpt_preprocessing nemo_gpt_preprocessing.yaml
#
# # Terminate cluster after you're done
# sky down nemo_gpt_preprocessing
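#
# BUCKET_NAME below must be set before launching. As a sketch (the bucket name
# is a placeholder to replace with your own), it can also be supplied on the
# command line with SkyPilot's --env flag instead of editing this file:
#   sky launch -s -c nemo_gpt_preprocessing --env BUCKET_NAME=<unique-bucket-name> nemo_gpt_preprocessing.yaml
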
num_nodes: 1

envs:
  LOCAL_DATASET_ROOT: /wiki
  DATASET_BUCKET_ROOT: /bucket
  BUCKET_NAME: # Enter a unique bucket name here - if it doesn't exist SkyPilot will create it

file_mounts:
  ${DATASET_BUCKET_ROOT}:
    name: ${BUCKET_NAME}
    store: gcs  # We recommend using GCS for large datasets in mount mode - S3 based mounts may fail with "transport endpoint is not connected" error.
    mode: MOUNT
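
# In MOUNT mode the bucket is attached as a directory at ${DATASET_BUCKET_ROOT},
# so files copied into it at the end of the run section below are uploaded to
# the gs://<BUCKET_NAME> bucket.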

setup: |
  conda activate nemo
  if [ $? -eq 0 ]; then
    echo "Nemo conda env exists"
  else
    echo "Setup start"
    conda create -y --name nemo python==3.10.12
    conda activate nemo

    # Install PyTorch
    pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

    # Install NeMo
    git clone https://github.com/NVIDIA/NeMo.git
    cd NeMo
    git checkout b4ad7eaa7873d632391d6985aa6b359f39c20bab
    pip install Cython
    pip install .[all]
    cd ..

    # Install megatron-core
    # We install in editable mode because setup.py does not install all
    # required modules if we install in non-editable mode.
    git clone https://github.com/NVIDIA/Megatron-LM
    cd Megatron-LM
    git checkout dc21350806361564b8ce61d4a8d247cb195cc5f0
    pip install -e .
    cd ..

    # Install ninja for faster compilation
    pip install ninja packaging

    # Install transformer engine and flash-attn (takes ~1 hr to compile)
    MAX_JOBS=4 pip install flash-attn==2.0.4 --no-build-isolation  # Version upper-capped by TransformerEngine
    MAX_JOBS=4 pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable
    pip install pytorch-extension

    # Install Apex
    git clone https://github.com/NVIDIA/apex.git
    cd apex
    git checkout 52e18c894223800cb611682dce27d88050edf1de
    pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
    cd ..
  fi
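
  # Optional sanity check (a sketch, assuming the installs above succeeded;
  # uncomment to verify that the key packages import cleanly):
  # python -c "import nemo, megatron.core, apex; print('NeMo stack OK')"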

run: |
  conda activate nemo

  # ======== Download and preprocess the wikipedia dataset ========
  if [ -f ${LOCAL_DATASET_ROOT}/train_data.jsonl ]; then
    echo "Dataset exists"
  else
    # Install axel for faster downloads
    sudo apt-get install -y axel

    mkdir -p ${LOCAL_DATASET_ROOT}
    cd ${LOCAL_DATASET_ROOT}

    # Download the wikipedia dataset (takes ~15 min)
    axel -n 20 https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2

    # Preprocess the wikipedia dataset (takes ~2 hours)
    pip install wikiextractor
    python -m wikiextractor.WikiExtractor enwiki-latest-pages-articles.xml.bz2 --json
    find text -name 'wiki_*' -exec cat {} \; > train_data.jsonl
  fi

  # ======== Download tokenizer files ========
  # Check if the tokenizer files exist
  if [ -f ${LOCAL_DATASET_ROOT}/gpt2-vocab.json ]; then
    echo "Tokenizer files exist"
  else
    # Download the tokenizer files
    cd ${LOCAL_DATASET_ROOT}
    axel -n 20 https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
    axel -n 20 https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
  fi

  # ======== Convert data to mmap format and write to bucket ========
  # Check if the mmap files exist
  if [ -f ${LOCAL_DATASET_ROOT}/hfbpe_gpt_training_data_text_document.bin ]; then
    echo "Mmap files exist"
  else
    # Convert the data to mmap format
    cd ${LOCAL_DATASET_ROOT}
    python $HOME/sky_workdir/NeMo/scripts/nlp_language_modeling/preprocess_data_for_megatron.py \
        --input=train_data.jsonl \
        --json-keys=text \
        --tokenizer-library=megatron \
        --vocab gpt2-vocab.json \
        --dataset-impl mmap \
        --tokenizer-type GPT2BPETokenizer \
        --merge-file gpt2-merges.txt \
        --output-prefix=hfbpe_gpt_training_data \
        --append-eod \
        --workers=32
  fi
echo "Done preprocessing dataset, copying to mounted bucket now."
cp {gpt2-merges.txt,gpt2-vocab.json,hfbpe_gpt_training_data_text_document.bin,hfbpe_gpt_training_data_text_document.idx} ${DATASET_BUCKET_ROOT}
echo "Done copying - data is now available on ${BUCKET_NAME} bucket."