@@ -22,57 +22,6 @@
   delete:
 
 jobs:
-  prepare_cache:
-    name: Prepare testmon cache
-    if: |
-      github.event_name == 'create' &&
-      github.event.ref_type == 'branch' &&
-      github.event.repository.full_name == 'hpcaitech/ColossalAI'
-    runs-on: [self-hosted, gpu]
-    container:
-      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
-      options: --rm
-    timeout-minutes: 5
-    defaults:
-      run:
-        shell: bash
-    steps:
-      - name: Copy testmon cache
-        run: |  # branch name may contain slash, we need to replace it with space
-          export REF_BRANCH=$(echo ${{ github.event.ref }} | sed "s/\// /")
-          if [ -d /github/home/testmon_cache/${MAIN_BRANCH} ]; then
-            cp -p -r /github/home/testmon_cache/${MAIN_BRANCH} "/github/home/testmon_cache/${REF_BRANCH}"
-          fi
-        env:
-          MAIN_BRANCH: ${{ github.event.master_branch }}
-
-  prepare_cache_for_pr:
-    name: Prepare testmon cache for PR
-    if: |
-      github.event_name == 'pull_request' &&
-      (github.event.action == 'opened' || github.event.action == 'reopened' || (github.event.action == 'edited' && github.event.changes.base != null)) &&
-      github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
-    runs-on: [self-hosted, gpu]
-    container:
-      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
-      options: --rm
-    timeout-minutes: 5
-    defaults:
-      run:
-        shell: bash
-    concurrency:
-      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-repare-cache
-      cancel-in-progress: true
-    steps:
-      - name: Copy testmon cache
-        run: |  # branch name may contain slash, we need to replace it with space
-          export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /")
-          if [ -d "/github/home/testmon_cache/${BASE}" ] && [ ! -z "$(ls -A "/github/home/testmon_cache/${BASE}")" ]; then
-            mkdir -p /github/home/testmon_cache/_pull/${PR_NUMBER} && cp -p -r "/github/home/testmon_cache/${BASE}"/.testmondata* /github/home/testmon_cache/_pull/${PR_NUMBER}
-          fi
-        env:
-          PR_NUMBER: ${{ github.event.number }}
-
   detect:
     name: Detect file change
     if: |
@@ -140,8 +89,8 @@ jobs:
     if: needs.detect.outputs.anyLibraryFileChanged == 'true'
     runs-on: [self-hosted, gpu]
     container:
-      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
-      options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
+      image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
+      options: --gpus all --rm -v /dev/shm -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
     timeout-minutes: 60
     defaults:
       run:
@@ -168,12 +117,13 @@ jobs:
           cd TensorNVMe
           conda install cmake
           pip install -r requirements.txt
-          pip install -v .
+          DISABLE_URING=1 pip install -v .
 
       - name: Store TensorNVMe Cache
         run: |
           cd TensorNVMe
           cp -p -r ./build /github/home/tensornvme_cache/
+          cp -p -r ./cmake-build /github/home/tensornvme_cache/
 
       - name: Checkout Colossal-AI
         uses: actions/checkout@v2
@@ -190,39 +140,32 @@ jobs:
 
       - name: Install Colossal-AI
         run: |
-          CUDA_EXT=1 pip install -v -e .
+          BUILD_EXT=1 pip install -v -e .
           pip install -r requirements/requirements-test.txt
 
       - name: Store Colossal-AI Cache
         run: |
           # -p flag is required to preserve the file timestamp to avoid ninja rebuild
           cp -p -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/
 
-      - name: Restore Testmon Cache
-        run: |
-          if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ] && [ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ]; then
-            cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* /__w/ColossalAI/ColossalAI/
-          fi
-        env:
-          PR_NUMBER: ${{ github.event.number }}
-
       - name: Execute Unit Testing
         run: |
-          CURL_CA_BUNDLE="" PYTHONPATH=$PWD pytest -m "not largedist" --testmon --testmon-forceselect --testmon-cov=. --durations=10 tests/
+          CURL_CA_BUNDLE="" PYTHONPATH=$PWD FAST_TEST=1 pytest \
+            -m "not largedist" \
+            --durations=0 \
+            --ignore tests/test_analyzer \
+            --ignore tests/test_auto_parallel \
+            --ignore tests/test_fx \
+            --ignore tests/test_autochunk \
+            --ignore tests/test_gptq \
+            --ignore tests/test_infer_ops \
+            --ignore tests/test_legacy \
+            --ignore tests/test_smoothquant \
+            tests/
         env:
-          DATA: /data/scratch/cifar-10
-          NCCL_SHM_DISABLE: 1
           LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
-          TESTMON_CORE_PKGS: /__w/ColossalAI/ColossalAI/requirements/requirements.txt,/__w/ColossalAI/ColossalAI/requirements/requirements-test.txt
           LLAMA_PATH: /data/scratch/llama-tiny
 
-      - name: Store Testmon Cache
-        run: |
-          mkdir -p /github/home/testmon_cache/_pull/${PR_NUMBER}
-          cp -p -r /__w/ColossalAI/ColossalAI/.testmondata* /github/home/testmon_cache/_pull/${PR_NUMBER}/
-        env:
-          PR_NUMBER: ${{ github.event.number }}
-
       - name: Collate artifact
         env:
           PR_NUMBER: ${{ github.event.number }}
@@ -259,54 +202,3 @@ jobs:
         with:
           name: report
           path: report/
-
-  store_cache:
-    name: Store testmon cache for PR
-    if: |
-      github.event_name == 'pull_request' &&
-      github.event.action == 'closed' &&
-      github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
-    runs-on: [self-hosted, gpu]
-    container:
-      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
-      options: --rm
-    timeout-minutes: 5
-    defaults:
-      run:
-        shell: bash
-    steps:
-      - name: Store testmon cache if possible
-        if: github.event.pull_request.merged == true
-        run: |  # branch name may contain slash, we need to replace it with space
-          export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /")
-          if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ] && [ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ]; then
-            cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* "/github/home/testmon_cache/${BASE}/"
-          fi
-        env:
-          PR_NUMBER: ${{ github.event.pull_request.number }}
-
-      - name: Remove testmon cache
-        run: |
-          rm -rf /github/home/testmon_cache/_pull/${PR_NUMBER}
-        env:
-          PR_NUMBER: ${{ github.event.pull_request.number }}
-
-  remove_cache:
-    name: Remove testmon cache
-    if: |
-      github.event_name == 'delete' &&
-      github.event.ref_type == 'branch' &&
-      github.event.repository.full_name == 'hpcaitech/ColossalAI'
-    runs-on: [self-hosted, gpu]
-    container:
-      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
-      options: --rm
-    timeout-minutes: 5
-    defaults:
-      run:
-        shell: bash
-    steps:
-      - name: Remove testmon cache
-        run: |  # branch name may contain slash, we need to replace it with space
-          export BASE=$(echo ${{ github.event.ref }} | sed "s/\// /")
-          rm -rf "/github/home/testmon_cache/${BASE}"