build: refine the build process and make the normal workflow code work. (#90)

kzjeef · web-flow · commit 125cf6398247 · 2025-07-28T12:07:15.000Z
* Update build-check-share-runner.yml * ci: adjust code clone for build test. * thridparty: reduce mkl size, remove unused so. * xformers: replace link with source. * cutlass: try use local tar rather than git submodule. * ci: remove shared runner * Update README.md * mm: add qwen vl2.5 model support. (#86) - add qwen vl 2.5 model support. - Qwen VL2.5 only support 'transformers' as vit engine, (trt not support yet.) - upgrade package version to make sure VL2.5 code is added. test command: server: `dashinfer_vlm_serve --model qwen/Qwen2.5-VL-3B-Instruct --vision_engine transformers --port 8000 --host=127.0.0.1` client: ``` curl http://localhost:8000/v1/chat/completions \ -H "Content-Type: application/json" \ -d \ '{"model": "qwen/Qwen2.5-VL-3B-Instruct", "messages": [{"role": "user", "content": [{ "type": "text", "text": "Describe the image." }, {"type": "image_url", "image_url": {"url": "https://farm4.staticflickr.com/3075/3168662394_7d7103de7d_z_d.jpg"}}]}], "max_completion_tokens": 1024, "top_p": 0.5, "temperature": 0.1, "frequency_penalty": 1.05 }' ``` result: ``` {"id":"chatcmpl-rxqDiCQEJweEeeB7FADiER","object":"chat.completion", "created":1747992522,"model":"model","choices":[{"index":0,"message":{"role":"assistant","content":"The image features a small hummingbird perched on a branch. The bird is positioned in the center of the scene, with its vibrant colors and delicate features clearly visible. The hummingbird appears to be enjoying its time in nature, possibly searching for food or simply resting on the branch. \n\nThere are no other birds or animals present in the image, making it a solitary moment captured in this natural setting."},"finish_reason":"stop"}],"usage":{"prompt_tokens":382,"total_tokens":95,"completion_tokens":81}} ``` * Update build-check.yml * Update build-check.yml * ci: fix release script. (#88) * cmake: nccl find lib without version. * flash attn: less memory footprint for compile. * workflow: fix typo. * ci: fix git safe dir issue. 
* multimodel: minor fix for benchmark * build: support 90a by default. * [Build]: upgrade pybind11 for python 3.12, also remove cutlass folder. * docker: fine centos docker and ubi8 docker. * add build script for ubi8 support. * github: action use ubi8 image rather than centos image. * github: try use github provided machine. * Revert "github: try use github provided machine." This reverts commit 6705c60. * github: remove prefix of docker image. * github: change workflow image to ubi8 images.
diff --git a/.github/workflows/build-check.yml b/.github/workflows/build-check.yml
@@ -12,14 +12,14 @@ jobs:
     strategy:
       matrix:
         arch: [X64]
-        image: ["dev-centos7-cu124:v1"]
+        image: ["docker.cnb.cool/thinksrc/dashinfer/dev-ubi8-cu124:latest"]
         enable_cuda: [0, 1]
         exclude:
           - arch: X64
             image: "dev-centos8-arm:v2"
     runs-on: [self-hosted, Linux, "${{ matrix.arch }}"]
     container:
-      image: dashinfer/${{ matrix.image }}
+      image: ${{ matrix.image }}
     env:
       # force use node16 instead of node20
       # otherwise it may cause GLIBCXX_2.27 not found
@@ -68,15 +68,12 @@ jobs:
     strategy:
       matrix:
         arch: [X64]
-        image: ["dev-centos7-cu124:v1"]
+        image: ["docker.cnb.cool/thinksrc/dashinfer/dev-ubi8-cu124:latest"]
         enable_cuda: [0, 1]
     runs-on: [self-hosted, Linux, "${{ matrix.arch }}"]
     container:
-      image: dashinfer/${{ matrix.image }}
+      image: ${{ matrix.image }}
     env:
-      # force use node16 instead of node20
-      # otherwise it may cause GLIBCXX_2.27 not found
-      # ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
       ENABLE_CUDA: ${{ matrix.enable_cuda }}
       BUILD_VERSION: "3.10"  # quoted: a bare 3.10 is a YAML float and collapses to 3.1
     steps:
diff --git a/.github/workflows/release_packages_all.yml b/.github/workflows/release_packages_all.yml
@@ -16,7 +16,7 @@ jobs:
     strategy:
       matrix:
         arch: [X64, ARM64]
-        image: ["dev-centos7-cu124:v1", "dev-centos8-arm:v2"]
+        image: ["docker.cnb.cool/thinksrc/dashinfer/dev-ubi8-cu124:latest", "dashinfer/dev-centos8-arm:v2"]
         enable_cuda: [0, 1]
         exclude:
           - arch: X64
@@ -27,7 +27,7 @@ jobs:
             enable_cuda: 1
     runs-on: [self-hosted, Linux, "${{ matrix.arch }}"]
     container:
-      image: dashinfer/${{ matrix.image }}
+      image: ${{ matrix.image }}
     env:
       # force use node16 instead of node20
       # otherwise it may cause GLIBCXX_2.27 not found
@@ -97,7 +97,7 @@ jobs:
     strategy:
       matrix:
         arch: [X64, ARM64]
-        image: ["dev-centos7-cu124:v1", "dev-centos8-arm:v2"]
+        image: ["docker.cnb.cool/thinksrc/dashinfer/dev-ubi8-cu124:latest", "dashinfer/dev-centos8-arm:v2"]
         enable_cuda: [0, 1]
         exclude:
           - arch: X64
@@ -108,7 +108,7 @@ jobs:
             enable_cuda: 1
     runs-on: [self-hosted, Linux, "${{ matrix.arch }}"]
     container:
-      image: dashinfer/${{ matrix.image }}
+      image: ${{ matrix.image }}
     env:
       # force use node16 instead of node20
       # otherwise it may cause GLIBCXX_2.27 not found
diff --git a/.github/workflows/release_packages_cuda_only.yml b/.github/workflows/release_packages_cuda_only.yml
@@ -17,12 +17,12 @@ jobs:
     strategy:
       matrix:
         arch: [X64]
-        image: ["dev-centos7-cu124:v1"]
+        image: ["docker.cnb.cool/thinksrc/dashinfer/dev-ubi8-cu124:latest"]
         enable_cuda: [1]
 
     runs-on: [self-hosted, Linux, "${{ matrix.arch }}"]
     container:
-      image: dashinfer/${{ matrix.image }}
+      image: ${{ matrix.image }}
     env:
       # force use node16 instead of node20
       # otherwise it may cause GLIBCXX_2.27 not found
@@ -36,7 +36,8 @@ jobs:
       uses: actions/checkout@v4
       with:
         lfs: true
-        submdules: false
+        submodules: false
+
 
     - name: Build tgz package
       shell: bash
@@ -47,6 +48,7 @@ jobs:
         fi
         source activate ds_py
 
+        git config --global --add safe.directory '*'
         git fetch --tags
         TAG_NAME=$(git describe --tags $(git rev-list --tags --max-count=1))
         VERSION_NUMBER=$(echo "$TAG_NAME" | sed 's/^v//' | sed 's/-.*$//')
@@ -83,22 +85,20 @@ jobs:
     strategy:
       matrix:
         arch: [X64]
-        image: ["dev-centos7-cu124:v1"]
+        image: ["docker.cnb.cool/thinksrc/dashinfer/dev-ubi8-cu124:latest"]
         enable_cuda: [1]
     runs-on: [self-hosted, Linux, "${{ matrix.arch }}"]
     container:
-      image: dashinfer/${{ matrix.image }}
+      image: ${{ matrix.image }}
     env:
-      # force use node16 instead of node20
-      # otherwise it may cause GLIBCXX_2.27 not found
-      # ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
       ENABLE_CUDA: ${{ matrix.enable_cuda }}
     steps:
     - name: Check out code
       uses: actions/checkout@v4
       with:
         lfs: true
-        submdules: false
+        submodules: false
+
 
     - name: Build manylinux wheels
       shell: bash
@@ -109,6 +109,7 @@ jobs:
             source /miniconda/etc/profile.d/conda.sh
         fi
 
+        git config --global --add safe.directory '*'
         git fetch --tags
         TAG_NAME=$(git describe --tags $(git rev-list --tags --max-count=1))
         VERSION_NUMBER=$(echo "$TAG_NAME" | sed 's/^v//')
diff --git a/README.md b/README.md
@@ -15,7 +15,6 @@
 
 
 ## News
-
 - [2024/12] 🔥 DashInfer: Announcing the release of v2.0, now with enhanced GPU (CUDA) support! This version includes features like prefix caching (with GPU & CPU swapping), guided decoding, optimized attention for GQA, a lockless reactor engine, and newly added support for the VLM model (Qwen-VL) and MoE Models. For more details, please refer to the [release notes](https://dashinfer.readthedocs.io/en/latest/index.html#v2-0-0).
 
 - [2024/06] DashInfer:  v1.0 release with x86 & ARMv9 CPU and CPU flash attention support.
@@ -208,8 +207,8 @@ If you find them useful, please feel free to cite these papers:
 - [x] Prefix Cache: Support GPU Prefix Cache and CPU Swap 
 - [x] Quantization: Fp8 A8W8 Activation quantization support on CUDA.
 - [x] LORA: Continuous Batch LORA Optimization.
-- [ ] Parallel Context phase and Generation phase within engine.
-- [ ] More effective MoE Operator on GPU.
+- [x] Parallel Context phase and Generation phase within engine.
+- [x] More effective MoE Operator on GPU.
 - [ ] Porting to AMD(ROCm) Platform.
 
 # License
diff --git a/build.sh b/build.sh
@@ -6,7 +6,7 @@ clean="OFF"
 with_platform="${AS_PLATFORM:-cuda}"
 # CUDA-related version; defaults to CUDA 12.4 when AS_CUDA_VERSION is not set
 cuda_version="${AS_CUDA_VERSION:-12.4}"
-cuda_sm="${AS_CUDA_SM:-80;86;90a}"
+cuda_sm="${AS_CUDA_SM:-80;90a}"
 NCCL_VERSION="${AS_NCCL_VERSION:-2.23.4}"
 build_folder="${AS_BUILD_FOLDER:-build}"
 force_conan="${AS_FORCE_CONAN:-OFF}"
diff --git a/cmake/FindNCCL.cmake b/cmake/FindNCCL.cmake
@@ -18,9 +18,26 @@ else()
 endif()
 
 message("find nccl with ${NCCL_LIBNAME}")
+# Look first for the NCCL library whose file name carries the version number
 find_library(
-  AS_NCCL_LIBRARY ${NCCL_LIBNAME}
-  PATH_SUFFIXES lib lib64 nccl-${NCCL_VERSION}-cuda-${CUDA_VERSION}/lib64)
+  AS_NCCL_LIBRARY_VERSIONED
+  NAMES nccl-${NCCL_VERSION}
+  PATH_SUFFIXES lib lib64 nccl-${NCCL_VERSION}-cuda-${CUDA_VERSION}/lib64
+)
+
+# If that was not found, fall back to the NCCL library without a version number
+if(NOT AS_NCCL_LIBRARY_VERSIONED)
+   message("find nccl without version number, searching ${CUDAToolkit_LIBRARY_DIR}")
+  find_library(
+    AS_NCCL_LIBRARY
+    NAMES nccl
+    PATHS 
+    ${CUDAToolkit_LIBRARY_DIR}
+  )
+else()
+  message("found nccl with version number")
+  set(AS_NCCL_LIBRARY ${AS_NCCL_LIBRARY_VERSIONED})
+endif()
 
 if(ENABLE_NV_STATIC_LIB)
   message("add nccl static lib")
@@ -42,7 +59,7 @@ install(FILES ${NCCL_LIBS}
         DESTINATION ${CMAKE_INSTALL_LIBDIR})
 endif()
 
-
+message("find nccl at ${NCCL_INCLUDE_DIR} lib: ${AS_NCCL_LIBRARY}")
 find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIR
                                   AS_NCCL_LIBRARY)
 
diff --git a/cmake/flash-attention.cmake b/cmake/flash-attention.cmake
@@ -44,6 +44,8 @@ include(ExternalProject)
 
     message(STATUS "Use flash-attention from external project")
     set(FLASH_ATTENTION_GIT_REPO https://github.com/Dao-AILab/flash-attention.git)
+# mirror for china.
+#    set(FLASH_ATTENTION_GIT_REPO https://gitee.com/lanyuflying/flash-attention.git)
     set(FLASH_ATTENTION_GIT_TAG 7551202cb2dd245432bc878447e19015c0af3c22)
     set(FLASH_ATTENTION_GIT_PATCH ${PROJECT_SOURCE_DIR}/third_party/patch/flash-attn.patch)
 
@@ -60,7 +62,7 @@ include(ExternalProject)
     SOURCE_SUBDIR csrc
     DEPENDS project_cutlass
     CMAKE_GENERATOR "Ninja"
-    BUILD_COMMAND ${CMAKE_COMMAND} --build . -j32 -v
+    BUILD_COMMAND ${CMAKE_COMMAND} --build . -j2 -v
     BUILD_BYPRODUCTS ${FLASHATTN_LIBRARY_PATH}/${FLASHATTN_LIBRARY_NAME}
     USES_TERMINAL true
     CMAKE_CACHE_ARGS
diff --git a/conan/conanfile.txt b/conan/conanfile.txt
@@ -2,7 +2,7 @@
   protobuf/3.18.3
   gtest/1.11.0
   glog/0.5.0
-  pybind11/2.8.1
+  pybind11/2.13.6
   zlib/1.2.13
 [generators]
   cmake
diff --git a/conan/conanfile_arm.txt b/conan/conanfile_arm.txt
@@ -3,7 +3,7 @@
   gtest/1.11.0
   glog/0.5.0
   libunwind/1.7.2
-  pybind11/2.8.1
+  pybind11/2.13.6
   zlib/1.2.13
 [generators]
   cmake
diff --git a/conan/conanfile_openmpi.txt b/conan/conanfile_openmpi.txt
@@ -2,7 +2,7 @@
   protobuf/3.18.3
   gtest/1.11.0
   glog/0.5.0
-  pybind11/2.8.1
+  pybind11/2.13.6
   openmpi/4.1.0
   hwloc/2.9.3  # 2.10 only support shared lib, use older version to build static lib.
   zlib/1.2.13
diff --git a/conan/conanfile_openmpi_arm.txt b/conan/conanfile_openmpi_arm.txt
@@ -3,7 +3,7 @@
   gtest/1.11.0
   glog/0.5.0
   libunwind/1.7.2
-  pybind11/2.8.1
+  pybind11/2.13.6
   openmpi/4.1.0
   zlib/1.2.13
   grpc/1.50.1
diff --git a/examples/benchmark/requirements.txt b/examples/benchmark/requirements.txt
@@ -5,4 +5,3 @@ pandas
 tabulate
 tqdm
 optimum
-auto-gptq
diff --git a/multimodal/README.md b/multimodal/README.md
@@ -8,10 +8,15 @@ DashInfer VLMs is a toolkit to support Vision Language Models (VLMs) inference b
 
 ## Supported Models
 - Qwen2-VL 2B/7B/72B
+- Qwen2.5-VL 3B/7B/72B (only the transformers ViT engine is supported)
 
 ## Architecture
 ![alt text](resource/dashinfer-vlm-arch.png)
 
+
+## Benchmark and Example
+- See [tests/README](tests/README.md)
+
 ## API Reference
 Currently, some [OpenAI chat completion API](https://platform.openai.com/docs/api-reference/chat) parameters are unavailable. Here is the list of parameters supported by DashInfer VLM.
 
diff --git a/multimodal/run_benchmark.sh b/multimodal/run_benchmark.sh
@@ -0,0 +1,7 @@
+# Run tests/benchmark_openai_api.py on 100 requests; the tests/ paths are resolved against the current working directory.
+python tests/benchmark_openai_api.py --prompt-file tests/data/docvqa_train_10k.jsonl --image-folder "$(pwd)/tests/data/share_textvqa/images/" --req-nums 100 \
+	--batch-size 32 \
+	--image-nums-mean 3 \
+	--image-nums-range 1  \
+	--response-mean 120 \
+	--response-len-range 64
diff --git a/multimodal/tests/benchmark_openai_api.py b/multimodal/tests/benchmark_openai_api.py
@@ -44,9 +44,9 @@ class BenchRequest:
 
 
 class OpenAIAPIBenchmark:
-    def __init__(self) -> None:
+    def __init__(self, host, port) -> None:
         openai_api_key = "EMPTY"
-        openai_api_base = "http://127.0.0.1:8000/v1"
+        openai_api_base = f"http://{host}:{port}/v1"
 
         self.client = OpenAI(
             api_key=openai_api_key,
@@ -269,6 +269,8 @@ def print_profiling_data(total_timecost):
     parser.add_argument("--image-nums-range", type=int, default=1)
     parser.add_argument("--frequency", type=float, default=1000)
     parser.add_argument("--batch-size", type=int, default=8)
+    parser.add_argument("--port", type=int, default=8000)
+    parser.add_argument("--host", type=str, default="localhost")
     args = parser.parse_args()
 
     ds = load_dataset("json", data_files=args.prompt_file, split="train")
@@ -298,7 +300,7 @@ def print_profiling_data(total_timecost):
         image_list, qa, args.req_nums, args.multi_turn, response_lens, image_nums
     )
 
-    model = OpenAIAPIBenchmark()
+    model = OpenAIAPIBenchmark(args.host, args.port)
 
     global_start = time.time()
 
diff --git a/multimodal/tests/data/download_data.sh b/multimodal/tests/data/download_data.sh
@@ -0,0 +1,3 @@
+wget https://huggingface.co/datasets/OpenGVLab/InternVL-Chat-V1-2-SFT-Data/resolve/main/opensource/docvqa_train_10k.jsonl
+wget https://huggingface.co/datasets/OpenGVLab/InternVL-Chat-V1-2-SFT-Data/resolve/main/data/share_textvqa.zip
+unzip share_textvqa.zip
diff --git a/multimodal/tests/test_openai_chat_completion.py b/multimodal/tests/test_openai_chat_completion.py
@@ -127,7 +127,7 @@ def main(args, client):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument('--host', type=str,
-                        default="0.0.0.0")
+                        default="localhost")
     parser.add_argument('--port', type=str,
                         default="8000")
     parser.add_argument('--type', type=str, default="all", choices=["all", "single_image", "multi_images", "video"])
diff --git a/python/requirements_dev.txt b/python/requirements_dev.txt
@@ -13,7 +13,6 @@ peft
 deepspeed
 
 optimum
-auto-gptq
 
 # xformers
 # transformers>=4.32.0
diff --git a/python/setup.py b/python/setup.py
@@ -56,7 +56,7 @@ def build_extension(self, ext):
         # 11.4
         cuda_version = os.getenv("AS_CUDA_VERSION", "12.4")
         nccl_version = os.getenv("AS_NCCL_VERSION", "2.23.4")
-        cuda_sm = os.getenv("AS_CUDA_SM", "'80;86'")
+        cuda_sm = os.getenv("AS_CUDA_SM", "'80;90a'")
         nv_system_lib = os.getenv("AS_SYSTEM_NV_LIB", "OFF")
         config = os.getenv("AS_BUILD_TYPE", "Release")
         as_platform = os.getenv("AS_PLATFORM", "cuda")
diff --git a/scripts/docker/dev_cuda_124.Dockerfile b/scripts/docker/dev_cuda_124.Dockerfile
diff --git a/scripts/docker/dev_ubi8_cuda_124.Dockerfile b/scripts/docker/dev_ubi8_cuda_124.Dockerfile
diff --git a/scripts/release/python_manylinux_build.sh b/scripts/release/python_manylinux_build.sh
diff --git a/scripts/release/python_manylinux_build_cuda.sh b/scripts/release/python_manylinux_build_cuda.sh
diff --git a/third_party/from_source/cutlass b/third_party/from_source/cutlass
diff --git a/third_party/patch/flash-attn.patch b/third_party/patch/flash-attn.patch