@@ -114,6 +114,8 @@ jobs:
 
   build-and-test-llama3:
     runs-on: ubicloud-gpu-standard-1-latest
+    container:
+      image: nvidia/cuda:12.6.3-cudnn-devel-ubuntu22.04
     env:
       HF_TOKEN: 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
     steps:
@@ -122,21 +124,21 @@ jobs:
       - run: echo "::add-mask::$HF_TOKEN"
 
       - name: Install OpenMP
-        run: sudo apt-get update && sudo apt-get install -y libomp-dev
+        run: apt-get update && apt-get install -y libomp-dev libopenmpi-dev python3-pip
 
       - name: Install dependencies
         run: pip install -r requirements.txt
 
       - name: Run preprocessing
-        run: python dev/data/tinyshakespeare.py --model_desc llama-3
+        run: python3 dev/data/tinyshakespeare.py --model_desc llama-3
 
       - name: Train model
         # use the first 10 layers, so that everything fits into the 20GB of
         # the A4000 Ada that we have in CI
-        run: python train_llama3.py --write_tensors 1 --dtype float32 --depth 10
+        run: python3 train_llama3.py --write_tensors 1 --dtype float32 --depth 10
 
       - name: Build FP32 precision
-        run: PRECISION=FP32 make test_llama3cu
+        run: PRECISION=FP32 NO_MULTI_GPU=1 make test_llama3cu
 
       - name: Run default
         run: ./test_llama3cu
@@ -148,7 +150,7 @@ jobs:
         run: ./test_llama3cu -r 2
 
       - name: Build BF16 precision
-        run: PRECISION=BF16 make train_llama3cu test_llama3cu
+        run: PRECISION=BF16 NO_MULTI_GPU=1 make train_llama3cu test_llama3cu
 
       - name: Run default
         run: ./test_llama3cu
@@ -167,7 +169,7 @@ jobs:
          git clone https://github.com/NVIDIA/cudnn-frontend.git
 
      - name: Build with cuDNN
-        run: USE_CUDNN=1 PRECISION=BF16 make train_llama3cu test_llama3cu
+        run: USE_CUDNN=1 PRECISION=BF16 NO_MULTI_GPU=1 make train_llama3cu test_llama3cu
 
      - name: Train model with cuDNN
        run: ./train_llama3cu