Merged

Changes from all commits (26 commits)
a0b1e39
Update build-check-share-runner.yml
kzjeef May 24, 2025
a2364a4
ci: adjust code clone for build test.
kzjeef May 25, 2025
562d496
thridparty: reduce mkl size, remove unused so.
kzjeef May 26, 2025
6adb588
xformers: replace link with source.
kzjeef May 26, 2025
1848c40
cutlass: try use local tar rather than git submodule.
kzjeef May 27, 2025
f2dd0aa
ci: remove shared runner
kzjeef May 28, 2025
169754a
Update README.md
kzjeef Jun 30, 2025
3b18ae9
mm: add qwen vl2.5 model support. (#86)
kzjeef May 28, 2025
0990098
Update build-check.yml
kzjeef May 29, 2025
f845eb8
Update build-check.yml
kzjeef May 29, 2025
cdcf2be
ci: fix release script. (#88)
kzjeef May 30, 2025
a776706
cmake: nccl find lib without version.
kzjeef Jul 20, 2025
3625b4a
flash attn: less memory footprint for compile.
kzjeef Jul 20, 2025
6513788
workflow: fix typo.
kzjeef Jul 20, 2025
db372c3
ci: fix git safe dir issue.
kzjeef Jul 23, 2025
b70fea6
multimodel: minor fix for benchmark
Jul 25, 2025
5fdca5c
build: support 90a by default.
Jul 25, 2025
5339d55
[Build]: upgrade pybind11 for python 3.12, also remove cutlass folder.
Jul 25, 2025
b3685e2
docker: fine centos docker and ubi8 docker.
kzjeef Jul 26, 2025
2fd9952
add build script for ubi8 support.
Jul 28, 2025
dec6847
github: action use ubi8 image rather than centos image.
Jul 28, 2025
6705c60
github: try use github provided machine.
Jul 28, 2025
ba55f05
Revert "github: try use github provided machine."
Jul 28, 2025
89c8bd9
github: remove prefix of docker image.
Jul 28, 2025
1749dd8
github: change workflow image to ubi8 images.
Jul 28, 2025
166be5f
Merge branch 'main' into main
kzjeef Jul 28, 2025
11 changes: 4 additions & 7 deletions .github/workflows/build-check.yml
@@ -12,14 +12,14 @@ jobs:
strategy:
matrix:
arch: [X64]
image: ["dev-centos7-cu124:v1"]
image: ["docker.cnb.cool/thinksrc/dashinfer/dev-ubi8-cu124:latest"]
enable_cuda: [0, 1]
exclude:
- arch: X64
image: "dev-centos8-arm:v2"
runs-on: [self-hosted, Linux, "${{ matrix.arch }}"]
container:
image: dashinfer/${{ matrix.image }}
image: ${{ matrix.image }}
env:
# force use node16 instead of node20
# otherwise it may cause GLIBCXX_2.27 not found
@@ -68,15 +68,12 @@ jobs:
strategy:
matrix:
arch: [X64]
image: ["dev-centos7-cu124:v1"]
image: ["docker.cnb.cool/thinksrc/dashinfer/dev-ubi8-cu124:latest"]
enable_cuda: [0, 1]
runs-on: [self-hosted, Linux, "${{ matrix.arch }}"]
container:
image: dashinfer/${{ matrix.image }}
image: ${{ matrix.image }}
env:
# force use node16 instead of node20
# otherwise it may cause GLIBCXX_2.27 not found
# ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
ENABLE_CUDA: ${{ matrix.enable_cuda }}
BUILD_VERSION: 3.10
steps:
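With the matrix entries now carrying fully qualified image references, the container spec no longer needs the hard-coded `dashinfer/` Docker Hub prefix, so images from other registries (here `docker.cnb.cool`) work directly. To reproduce the CI environment locally (a sketch, assuming the registry permits anonymous pulls):

```bash
docker pull docker.cnb.cool/thinksrc/dashinfer/dev-ubi8-cu124:latest
docker run --rm -it --gpus all \
  docker.cnb.cool/thinksrc/dashinfer/dev-ubi8-cu124:latest bash
```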
8 changes: 4 additions & 4 deletions .github/workflows/release_packages_all.yml
@@ -16,7 +16,7 @@ jobs:
strategy:
matrix:
arch: [X64, ARM64]
image: ["dev-centos7-cu124:v1", "dev-centos8-arm:v2"]
image: ["docker.cnb.cool/thinksrc/dashinfer/dev-ubi8-cu124:latest", "dashinfer/dev-centos8-arm:v2"]
enable_cuda: [0, 1]
exclude:
- arch: X64
@@ -27,7 +27,7 @@
enable_cuda: 1
runs-on: [self-hosted, Linux, "${{ matrix.arch }}"]
container:
image: dashinfer/${{ matrix.image }}
image: ${{ matrix.image }}
env:
# force use node16 instead of node20
# otherwise it may cause GLIBCXX_2.27 not found
@@ -97,7 +97,7 @@ jobs:
strategy:
matrix:
arch: [X64, ARM64]
image: ["dev-centos7-cu124:v1", "dev-centos8-arm:v2"]
image: ["docker.cnb.cool/thinksrc/dashinfer/dev-ubi8-cu124:latest", "dashinfer/dev-centos8-arm:v2"]
enable_cuda: [0, 1]
exclude:
- arch: X64
@@ -108,7 +108,7 @@
enable_cuda: 1
runs-on: [self-hosted, Linux, "${{ matrix.arch }}"]
container:
image: dashinfer/${{ matrix.image }}
image: ${{ matrix.image }}
env:
# force use node16 instead of node20
# otherwise it may cause GLIBCXX_2.27 not found
19 changes: 10 additions & 9 deletions .github/workflows/release_packages_cuda_only.yml
@@ -17,12 +17,12 @@ jobs:
strategy:
matrix:
arch: [X64]
image: ["dev-centos7-cu124:v1"]
image: ["docker.cnb.cool/thinksrc/dashinfer/dev-ubi8-cu124:latest"]
enable_cuda: [1]

runs-on: [self-hosted, Linux, "${{ matrix.arch }}"]
container:
image: dashinfer/${{ matrix.image }}
image: ${{ matrix.image }}
env:
# force use node16 instead of node20
# otherwise it may cause GLIBCXX_2.27 not found
@@ -36,7 +36,8 @@
uses: actions/checkout@v4
with:
lfs: true
submdules: false
submodules: false


- name: Build tgz package
shell: bash
@@ -47,6 +48,7 @@
fi
source activate ds_py

git config --global --add safe.directory '*'
git fetch --tags
TAG_NAME=$(git describe --tags $(git rev-list --tags --max-count=1))
VERSION_NUMBER=$(echo "$TAG_NAME" | sed 's/^v//' | sed 's/-.*$//')
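The `safe.directory` line is needed because the checkout is owned by the host runner's UID while this step runs as a different user inside the container; recent git refuses to operate on such repositories. A minimal illustration of the failure mode (assumed reproduction, not output captured from this CI; the workspace path is illustrative):

```bash
# workspace path follows the usual container-job layout; illustrative only
git -C /__w/dash-infer/dash-infer describe --tags
# => fatal: detected dubious ownership in repository ...
git config --global --add safe.directory '*'   # trust all paths
git fetch --tags                               # tag-based versioning now works
```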
@@ -83,22 +85,20 @@
strategy:
matrix:
arch: [X64]
image: ["dev-centos7-cu124:v1"]
image: ["docker.cnb.cool/thinksrc/dashinfer/dev-ubi8-cu124:latest"]
enable_cuda: [1]
runs-on: [self-hosted, Linux, "${{ matrix.arch }}"]
container:
image: dashinfer/${{ matrix.image }}
image: ${{ matrix.image }}
env:
# force use node16 instead of node20
# otherwise it may cause GLIBCXX_2.27 not found
# ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
ENABLE_CUDA: ${{ matrix.enable_cuda }}
steps:
- name: Check out code
uses: actions/checkout@v4
with:
lfs: true
submdules: false
submodules: false


- name: Build manylinux wheels
shell: bash
@@ -109,6 +109,7 @@
source /miniconda/etc/profile.d/conda.sh
fi

git config --global --add safe.directory '*'
git fetch --tags
TAG_NAME=$(git describe --tags $(git rev-list --tags --max-count=1))
VERSION_NUMBER=$(echo "$TAG_NAME" | sed 's/^v//')
5 changes: 2 additions & 3 deletions README.md
@@ -15,7 +15,6 @@


## News

- [2024/12] 🔥 DashInfer: Announcing the release of v2.0, now with enhanced GPU (CUDA) support! This version includes features like prefix caching (with GPU & CPU swapping), guided decoding, optimized attention for GQA, a lockless reactor engine, and newly added support for the VLM model (Qwen-VL) and MoE Models. For more details, please refer to the [release notes](https://dashinfer.readthedocs.io/en/latest/index.html#v2-0-0).

- [2024/06] DashInfer: v1.0 release with x86 & ARMv9 CPU and CPU flash attention support.
@@ -208,8 +207,8 @@ If you find them useful, please feel free to cite these papers:
- [x] Prefix Cache: Support GPU Prefix Cache and CPU Swap
- [x] Quantization: Fp8 A8W8 Activation quantization support on CUDA.
- [x] LORA: Continuous Batch LORA Optimization.
- [ ] Parallel Context phase and Generation phase within engine.
- [ ] More effective MoE Operator on GPU.
- [x] Parallel Context phase and Generation phase within engine.
- [x] More effective MoE Operator on GPU.
- [ ] Porting to AMD(ROCm) Platform.

# License
2 changes: 1 addition & 1 deletion build.sh
@@ -6,7 +6,7 @@ clean="OFF"
with_platform="${AS_PLATFORM:-cuda}"
# cuda related version, provide a default value for cuda 12.4
cuda_version="${AS_CUDA_VERSION:-12.4}"
cuda_sm="${AS_CUDA_SM:-80;86;90a}"
cuda_sm="${AS_CUDA_SM:-80;90a}"
NCCL_VERSION="${AS_NCCL_VERSION:-2.23.4}"
build_folder="${AS_BUILD_FOLDER:-build}"
force_conan="${AS_FORCE_CONAN:-OFF}"
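Dropping SM 86 from the default list shrinks the fatbin and the build time while keeping 80 and 90a covered, and the list stays overridable through the environment. For example, to build for Ampere consumer GPUs as well (an illustrative invocation, not a documented workflow):

```bash
# restore SM 86 on top of the new default of "80;90a"
AS_CUDA_SM="80;86;90a" AS_CUDA_VERSION=12.4 AS_PLATFORM=cuda ./build.sh
```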
23 changes: 20 additions & 3 deletions cmake/FindNCCL.cmake
@@ -18,9 +18,26 @@ else()
endif()

message("find nccl with ${NCCL_LIBNAME}")
# Prefer the nccl library with a version number
find_library(
AS_NCCL_LIBRARY ${NCCL_LIBNAME}
PATH_SUFFIXES lib lib64 nccl-${NCCL_VERSION}-cuda-${CUDA_VERSION}/lib64)
AS_NCCL_LIBRARY_VERSIONED
NAMES nccl-${NCCL_VERSION}
PATH_SUFFIXES lib lib64 nccl-${NCCL_VERSION}-cuda-${CUDA_VERSION}/lib64
)

# If not found, fall back to the nccl library without a version number
if(NOT AS_NCCL_LIBRARY_VERSIONED)
message("find nccl without version number, searching ${CUDAToolkit_LIBRARY_DIR}")
find_library(
AS_NCCL_LIBRARY
NAMES nccl
PATHS
${CUDAToolkit_LIBRARY_DIR}
)
else()
message("found nccl with version number")
set(AS_NCCL_LIBRARY ${AS_NCCL_LIBRARY_VERSIONED})
endif()

if(ENABLE_NV_STATIC_LIB)
message("add nccl static lib")
@@ -42,7 +59,7 @@ install(FILES ${NCCL_LIBS}
DESTINATION ${CMAKE_INSTALL_LIBDIR})
endif()


message("find nccl at ${NCCL_INCLUDE_DIR} lib: ${AS_NCCL_LIBRARY}")
find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIR
AS_NCCL_LIBRARY)

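The lookup now tries a versioned name first and only then searches `CUDAToolkit_LIBRARY_DIR` for a plain `nccl`, so systems that ship an unversioned `libnccl.so` alongside the CUDA toolkit still configure. One way to confirm which path the configure step picked up (build directory name illustrative):

```bash
# find_library caches its result, so the cache shows the resolved NCCL path
grep -i nccl build/CMakeCache.txt
```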
4 changes: 3 additions & 1 deletion cmake/flash-attention.cmake
@@ -44,6 +44,8 @@ include(ExternalProject)

message(STATUS "Use flash-attention from external project")
set(FLASH_ATTENTION_GIT_REPO https://github.com/Dao-AILab/flash-attention.git)
# mirror for china.
# set(FLASH_ATTENTION_GIT_REPO https://gitee.com/lanyuflying/flash-attention.git)
set(FLASH_ATTENTION_GIT_TAG 7551202cb2dd245432bc878447e19015c0af3c22)
set(FLASH_ATTENTION_GIT_PATCH ${PROJECT_SOURCE_DIR}/third_party/patch/flash-attn.patch)

@@ -60,7 +62,7 @@ include(ExternalProject)
SOURCE_SUBDIR csrc
DEPENDS project_cutlass
CMAKE_GENERATOR "Ninja"
BUILD_COMMAND ${CMAKE_COMMAND} --build . -j32 -v
BUILD_COMMAND ${CMAKE_COMMAND} --build . -j2 -v
BUILD_BYPRODUCTS ${FLASHATTN_LIBRARY_PATH}/${FLASHATTN_LIBRARY_NAME}
USES_TERMINAL true
CMAKE_CACHE_ARGS
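Cutting the external flash-attention build from `-j32` to `-j2` trades wall-clock time for peak memory, since each nvcc instance compiling these kernels can need several gigabytes of RAM. A RAM-aware middle ground might look like this (the ~10 GB-per-job figure is an assumption, not measured in this repo):

```bash
# scale parallel nvcc jobs to available memory, assuming ~10 GB per job
avail_gb=$(free -g | awk '/^Mem:/{print $7}')
jobs=$(( avail_gb / 10 ))
[ "$jobs" -lt 1 ] && jobs=1   # always allow at least one job
cmake --build . -j"$jobs" -v
```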
2 changes: 1 addition & 1 deletion conan/conanfile.txt
@@ -2,7 +2,7 @@
protobuf/3.18.3
gtest/1.11.0
glog/0.5.0
pybind11/2.8.1
pybind11/2.13.6
zlib/1.2.13
[generators]
cmake
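pybind11 2.8.1 predates CPython 3.12 (support landed in the 2.11 series), which is why the bump to 2.13.6 across all four conanfiles accompanies the Python 3.12 work. A quick way to confirm which recipe conan resolves (illustrative conan 1.x invocation):

```bash
conan install conan/conanfile.txt --build=missing 2>&1 | grep -i pybind11
```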
2 changes: 1 addition & 1 deletion conan/conanfile_arm.txt
@@ -3,7 +3,7 @@
gtest/1.11.0
glog/0.5.0
libunwind/1.7.2
pybind11/2.8.1
pybind11/2.13.6
zlib/1.2.13
[generators]
cmake
2 changes: 1 addition & 1 deletion conan/conanfile_openmpi.txt
@@ -2,7 +2,7 @@
protobuf/3.18.3
gtest/1.11.0
glog/0.5.0
pybind11/2.8.1
pybind11/2.13.6
openmpi/4.1.0
hwloc/2.9.3 # 2.10 only support shared lib, use older version to build static lib.
zlib/1.2.13
2 changes: 1 addition & 1 deletion conan/conanfile_openmpi_arm.txt
@@ -3,7 +3,7 @@
gtest/1.11.0
glog/0.5.0
libunwind/1.7.2
pybind11/2.8.1
pybind11/2.13.6
openmpi/4.1.0
zlib/1.2.13
grpc/1.50.1
1 change: 0 additions & 1 deletion examples/benchmark/requirements.txt
@@ -5,4 +5,3 @@ pandas
tabulate
tqdm
optimum
auto-gptq
5 changes: 5 additions & 0 deletions multimodal/README.md
@@ -8,10 +8,15 @@ DashInfer VLMs is a toolkit to support Vision Language Models (VLMs) inference b

## Supported Models
- Qwen2-VL 2B/7B/72B
- Qwen2.5-VL 2B/7B/72B (only supports the transformers ViT engine)

## Architecture
![alt text](resource/dashinfer-vlm-arch.png)


## Benchmark and Example
- See [tests/README](tests/README.md)

## API Reference
Currently, some [openai chat completion API](https://platform.openai.com/docs/api-reference/chat) parameters are unavailable. Here is the list of parameters supported in dashinfer vlm.

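Because the server speaks the OpenAI chat-completions protocol, a minimal smoke test needs nothing but curl (a sketch assuming a local server on the default port; the model name is illustrative):

```bash
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "qwen-vl", "messages": [{"role": "user", "content": "Hello"}]}'
```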
7 changes: 7 additions & 0 deletions multimodal/run_benchmark.sh
@@ -0,0 +1,7 @@
python tests/benchmark_openai_api.py --prompt-file tests/data/docvqa_train_10k.jsonl --image-folder `pwd`/tests/data/share_textvqa/images/ --req-nums 100 \
--batch-size 32 \
--image-nums-mean 3 \
--image-nums-range 1 \
--response-mean 120 \
--response-len-range 64

8 changes: 5 additions & 3 deletions multimodal/tests/benchmark_openai_api.py
@@ -44,9 +44,9 @@ class BenchRequest:


class OpenAIAPIBenchmark:
def __init__(self) -> None:
def __init__(self, host, port) -> None:
openai_api_key = "EMPTY"
openai_api_base = "http://127.0.0.1:8000/v1"
openai_api_base = f"http://{host}:{port}/v1"

self.client = OpenAI(
api_key=openai_api_key,
@@ -269,6 +269,8 @@ def print_profiling_data(total_timecost):
parser.add_argument("--image-nums-range", type=int, default=1)
parser.add_argument("--frequency", type=float, default=1000)
parser.add_argument("--batch-size", type=int, default=8)
parser.add_argument("--port", type=int, default=8000)
parser.add_argument("--host", type=str, default="localhost")
args = parser.parse_args()

ds = load_dataset("json", data_files=args.prompt_file, split="train")
@@ -298,7 +300,7 @@ def print_profiling_data(total_timecost):
image_list, qa, args.req_nums, args.multi_turn, response_lens, image_nums
)

model = OpenAIAPIBenchmark()
model = OpenAIAPIBenchmark(args.host, args.port)

global_start = time.time()

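With host and port threaded through the constructor, the benchmark can target a remote endpoint instead of the previously hard-coded `127.0.0.1:8000`. An illustrative invocation (address and request count are placeholders):

```bash
python tests/benchmark_openai_api.py \
  --prompt-file tests/data/docvqa_train_10k.jsonl \
  --image-folder "$(pwd)/tests/data/share_textvqa/images/" \
  --host 10.0.0.5 --port 9000 --req-nums 10
```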
3 changes: 3 additions & 0 deletions multimodal/tests/data/download_data.sh
@@ -0,0 +1,3 @@
wget https://huggingface.co/datasets/OpenGVLab/InternVL-Chat-V1-2-SFT-Data/resolve/main/opensource/docvqa_train_10k.jsonl
wget https://huggingface.co/datasets/OpenGVLab/InternVL-Chat-V1-2-SFT-Data/resolve/main/data/share_textvqa.zip
unzip share_textvqa.zip
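Both files come from the InternVL-Chat-V1-2 SFT data release; the zip is assumed to expand under `share_textvqa/`, matching the `--image-folder` path used by run_benchmark.sh. A slightly more defensive variant of the same script (a sketch, same URLs):

```bash
set -euo pipefail
wget -c https://huggingface.co/datasets/OpenGVLab/InternVL-Chat-V1-2-SFT-Data/resolve/main/opensource/docvqa_train_10k.jsonl
wget -c https://huggingface.co/datasets/OpenGVLab/InternVL-Chat-V1-2-SFT-Data/resolve/main/data/share_textvqa.zip
unzip -n share_textvqa.zip   # -n: never overwrite existing files
```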
2 changes: 1 addition & 1 deletion multimodal/tests/test_openai_chat_completion.py
@@ -127,7 +127,7 @@ def main(args, client):
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--host', type=str,
default="0.0.0.0")
default="localhost")
parser.add_argument('--port', type=str,
default="8000")
parser.add_argument('--type', type=str, default="all", choices=["all", "single_image", "multi_images", "video"])
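`0.0.0.0` is a server bind address, not something a client should dial, so `localhost` is the saner client-side default; an explicit override still works. For a server on another machine (address illustrative):

```bash
python multimodal/tests/test_openai_chat_completion.py --host 192.168.1.20 --port 8000
```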
1 change: 0 additions & 1 deletion python/requirements_dev.txt
@@ -13,7 +13,6 @@ peft
deepspeed

optimum
auto-gptq

# xformers
# transformers>=4.32.0
2 changes: 1 addition & 1 deletion python/setup.py
@@ -56,7 +56,7 @@ def build_extension(self, ext):
# 11.4
cuda_version = os.getenv("AS_CUDA_VERSION", "12.4")
nccl_version = os.getenv("AS_NCCL_VERSION", "2.23.4")
cuda_sm = os.getenv("AS_CUDA_SM", "'80;86'")
cuda_sm = os.getenv("AS_CUDA_SM", "'80;90a'")
nv_system_lib = os.getenv("AS_SYSTEM_NV_LIB", "OFF")
config = os.getenv("AS_BUILD_TYPE", "Release")
as_platform = os.getenv("AS_PLATFORM", "cuda")
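The wheel build follows the same `AS_CUDA_SM` convention as build.sh, including the nested quoting in the default value. Restoring SM 86 for a local wheel might look like this (a sketch, not a documented workflow):

```bash
# match build.sh's new default of 80;90a, plus SM 86 (hypothetical override)
AS_CUDA_SM="'80;86;90a'" AS_PLATFORM=cuda AS_CUDA_VERSION=12.4 \
  python setup.py bdist_wheel
```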