@@ -114,6 +114,8 @@ jobs:
114114
115115 build-and-test-llama3 :
116116 runs-on : ubicloud-gpu-standard-1-latest
117+ container :
118+ image : nvidia/cuda:12.6.3-cudnn-devel-ubuntu22.04
117119 env :
118120 HF_TOKEN : ${{ secrets.HF_TOKEN }}
119121 steps :
@@ -122,19 +124,19 @@ jobs:
122124 - run : echo "::add-mask::$HF_TOKEN"
123125
124126 - name : Install OpenMP
125- run : sudo apt-get update && sudo apt-get install -y libomp-dev
127+ run : apt-get update && apt-get install -y libomp-dev libopenmpi-dev python3-pip
126128
127129 - name : Install dependencies
128130 run : pip install -r requirements.txt
129131
130132 - name : Run preprocessing
131- run : python dev/data/tinyshakespeare.py --model_desc llama-3
133+ run : python3 dev/data/tinyshakespeare.py --model_desc llama-3
132134
133135 - name : Train model
134- run : python train_llama3.py --write_tensors 1 --dtype float32 --offload 1
136+ run : python3 train_llama3.py --write_tensors 1 --dtype float32 --offload 1
135137
136138 - name : Build FP32 precision
137- run : PRECISION=FP32 make test_llama3cu
139+ run : PRECISION=FP32 NO_MULTI_GPU=1 make test_llama3cu
138140
139141 - name : Run default
140142 run : ./test_llama3cu
@@ -146,7 +148,7 @@ jobs:
146148 run : ./test_llama3cu -r 2
147149
148150 - name : Build BF16 precision
149- run : PRECISION=BF16 make train_llama3cu test_llama3cu
151+ run : PRECISION=BF16 NO_MULTI_GPU=1 make train_llama3cu test_llama3cu
150152
151153 - name : Run default
152154 run : ./test_llama3cu
@@ -165,7 +167,7 @@ jobs:
165167 git clone https://github.com/NVIDIA/cudnn-frontend.git
166168
167169 - name : Build with cuDNN
168- run : USE_CUDNN=1 PRECISION=BF16 make train_llama3cu test_llama3cu
170+ run : USE_CUDNN=1 PRECISION=BF16 NO_MULTI_GPU=1 make train_llama3cu test_llama3cu
169171
170172 - name : Train model with cuDNN
171173 run : ./train_llama3cu
0 commit comments