Merged

Changes from all commits (26 commits)
a0b1e39
Update build-check-share-runner.yml
kzjeef May 24, 2025
a2364a4
ci: adjust code clone for build test.
kzjeef May 25, 2025
562d496
thridparty: reduce mkl size, remove unused so.
kzjeef May 26, 2025
6adb588
xformers: replace link with source.
kzjeef May 26, 2025
1848c40
cutlass: try use local tar rather than git submodule.
kzjeef May 27, 2025
f2dd0aa
ci: remove shared runner
kzjeef May 28, 2025
169754a
Update README.md
kzjeef Jun 30, 2025
3b18ae9
mm: add qwen vl2.5 model support. (#86)
kzjeef May 28, 2025
0990098
Update build-check.yml
kzjeef May 29, 2025
f845eb8
Update build-check.yml
kzjeef May 29, 2025
cdcf2be
ci: fix release script. (#88)
kzjeef May 30, 2025
a776706
cmake: nccl find lib without version.
kzjeef Jul 20, 2025
3625b4a
flash attn: less memory footprint for compile.
kzjeef Jul 20, 2025
6513788
workflow: fix typo.
kzjeef Jul 20, 2025
db372c3
ci: fix git safe dir issue.
kzjeef Jul 23, 2025
b70fea6
multimodel: minor fix for benchmark
Jul 25, 2025
5fdca5c
build: support 90a by default.
Jul 25, 2025
5339d55
[Build]: upgrade pybind11 for python 3.12, also remove cutlass folder.
Jul 25, 2025
b3685e2
docker: fine centos docker and ubi8 docker.
kzjeef Jul 26, 2025
2fd9952
add build script for ubi8 support.
Jul 28, 2025
dec6847
github: action use ubi8 image rather than centos image.
Jul 28, 2025
6705c60
github: try use github provided machine.
Jul 28, 2025
ba55f05
Revert "github: try use github provided machine."
Jul 28, 2025
89c8bd9
github: remove prefix of docker image.
Jul 28, 2025
1749dd8
github: change workflow image to ubi8 images.
Jul 28, 2025
166be5f
Merge branch 'main' into main
kzjeef Jul 28, 2025
11 changes: 4 additions & 7 deletions .github/workflows/build-check.yml
@@ -12,14 +12,14 @@ jobs:
strategy:
matrix:
arch: [X64]
image: ["dev-centos7-cu124:v1"]
image: ["docker.cnb.cool/thinksrc/dashinfer/dev-ubi8-cu124:latest"]
enable_cuda: [0, 1]
exclude:
- arch: X64
image: "dev-centos8-arm:v2"
runs-on: [self-hosted, Linux, "${{ matrix.arch }}"]
container:
image: dashinfer/${{ matrix.image }}
image: ${{ matrix.image }}
env:
# force use node16 instead of node20
# otherwise it may cause GLIBCXX_2.27 not found
@@ -68,15 +68,12 @@ jobs:
strategy:
matrix:
arch: [X64]
image: ["dev-centos7-cu124:v1"]
image: ["docker.cnb.cool/thinksrc/dashinfer/dev-ubi8-cu124:latest"]
enable_cuda: [0, 1]
runs-on: [self-hosted, Linux, "${{ matrix.arch }}"]
container:
image: dashinfer/${{ matrix.image }}
image: ${{ matrix.image }}
env:
# force use node16 instead of node20
# otherwise it may cause GLIBCXX_2.27 not found
# ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
ENABLE_CUDA: ${{ matrix.enable_cuda }}
BUILD_VERSION: 3.10
steps:
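With the matrix entries now carrying fully qualified image references, the container spec no longer needs the hard-coded `dashinfer/` Docker Hub prefix, so images from other registries (here `docker.cnb.cool`) work directly. To reproduce the CI environment locally (a sketch, assuming the registry permits anonymous pulls):

```bash
docker pull docker.cnb.cool/thinksrc/dashinfer/dev-ubi8-cu124:latest
docker run --rm -it --gpus all \
  docker.cnb.cool/thinksrc/dashinfer/dev-ubi8-cu124:latest bash
```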
8 changes: 4 additions & 4 deletions .github/workflows/release_packages_all.yml
@@ -16,7 +16,7 @@ jobs:
strategy:
matrix:
arch: [X64, ARM64]
image: ["dev-centos7-cu124:v1", "dev-centos8-arm:v2"]
image: ["docker.cnb.cool/thinksrc/dashinfer/dev-ubi8-cu124:latest", "dashinfer/dev-centos8-arm:v2"]
enable_cuda: [0, 1]
exclude:
- arch: X64
@@ -27,7 +27,7 @@
enable_cuda: 1
runs-on: [self-hosted, Linux, "${{ matrix.arch }}"]
container:
image: dashinfer/${{ matrix.image }}
image: ${{ matrix.image }}
env:
# force use node16 instead of node20
# otherwise it may cause GLIBCXX_2.27 not found
@@ -97,7 +97,7 @@ jobs:
strategy:
matrix:
arch: [X64, ARM64]
image: ["dev-centos7-cu124:v1", "dev-centos8-arm:v2"]
image: ["docker.cnb.cool/thinksrc/dashinfer/dev-ubi8-cu124:latest", "dashinfer/dev-centos8-arm:v2"]
enable_cuda: [0, 1]
exclude:
- arch: X64
@@ -108,7 +108,7 @@
enable_cuda: 1
runs-on: [self-hosted, Linux, "${{ matrix.arch }}"]
container:
image: dashinfer/${{ matrix.image }}
image: ${{ matrix.image }}
env:
# force use node16 instead of node20
# otherwise it may cause GLIBCXX_2.27 not found
19 changes: 10 additions & 9 deletions .github/workflows/release_packages_cuda_only.yml
@@ -17,12 +17,12 @@ jobs:
strategy:
matrix:
arch: [X64]
image: ["dev-centos7-cu124:v1"]
image: ["docker.cnb.cool/thinksrc/dashinfer/dev-ubi8-cu124:latest"]
enable_cuda: [1]

runs-on: [self-hosted, Linux, "${{ matrix.arch }}"]
container:
image: dashinfer/${{ matrix.image }}
image: ${{ matrix.image }}
env:
# force use node16 instead of node20
# otherwise it may cause GLIBCXX_2.27 not found
@@ -36,7 +36,8 @@
uses: actions/checkout@v4
with:
lfs: true
submdules: false
submodules: false


- name: Build tgz package
shell: bash
@@ -47,6 +48,7 @@
fi
source activate ds_py

git config --global --add safe.directory '*'
git fetch --tags
TAG_NAME=$(git describe --tags $(git rev-list --tags --max-count=1))
VERSION_NUMBER=$(echo "$TAG_NAME" | sed 's/^v//' | sed 's/-.*$//')
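The `safe.directory` line is needed because the checkout is owned by the host runner's UID while this step runs as a different user inside the container; recent git refuses to operate on such repositories. A minimal illustration of the failure mode (assumed reproduction, not output captured from this CI; the workspace path is illustrative):

```bash
# workspace path follows the usual container-job layout; illustrative only
git -C /__w/dash-infer/dash-infer describe --tags
# => fatal: detected dubious ownership in repository ...
git config --global --add safe.directory '*'   # trust all paths
git fetch --tags                               # tag-based versioning now works
```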
@@ -83,22 +85,20 @@
strategy:
matrix:
arch: [X64]
image: ["dev-centos7-cu124:v1"]
image: ["docker.cnb.cool/thinksrc/dashinfer/dev-ubi8-cu124:latest"]
enable_cuda: [1]
runs-on: [self-hosted, Linux, "${{ matrix.arch }}"]
container:
image: dashinfer/${{ matrix.image }}
image: ${{ matrix.image }}
env:
# force use node16 instead of node20
# otherwise it may cause GLIBCXX_2.27 not found
# ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
ENABLE_CUDA: ${{ matrix.enable_cuda }}
steps:
- name: Check out code
uses: actions/checkout@v4
with:
lfs: true
submdules: false
submodules: false


- name: Build manylinux wheels
shell: bash
@@ -109,6 +109,7 @@
source /miniconda/etc/profile.d/conda.sh
fi

git config --global --add safe.directory '*'
git fetch --tags
TAG_NAME=$(git describe --tags $(git rev-list --tags --max-count=1))
VERSION_NUMBER=$(echo "$TAG_NAME" | sed 's/^v//')
5 changes: 2 additions & 3 deletions README.md
@@ -15,7 +15,6 @@


## News

- [2024/12] 🔥 DashInfer: Announcing the release of v2.0, now with enhanced GPU (CUDA) support! This version includes features like prefix caching (with GPU & CPU swapping), guided decoding, optimized attention for GQA, a lockless reactor engine, and newly added support for the VLM model (Qwen-VL) and MoE Models. For more details, please refer to the [release notes](https://dashinfer.readthedocs.io/en/latest/index.html#v2-0-0).

- [2024/06] DashInfer: v1.0 release with x86 & ARMv9 CPU and CPU flash attention support.
@@ -208,8 +207,8 @@ If you find them useful, please feel free to cite these papers:
- [x] Prefix Cache: Support GPU Prefix Cache and CPU Swap
- [x] Quantization: Fp8 A8W8 Activation quantization support on CUDA.
- [x] LORA: Continuous Batch LORA Optimization.
- [ ] Parallel Context phase and Generation phase within engine.
- [ ] More effective MoE Operator on GPU.
- [x] Parallel Context phase and Generation phase within engine.
- [x] More effective MoE Operator on GPU.
- [ ] Porting to AMD(ROCm) Platform.

# License
2 changes: 1 addition & 1 deletion build.sh
@@ -6,7 +6,7 @@ clean="OFF"
with_platform="${AS_PLATFORM:-cuda}"
# cuda related version, provide a default value for cuda 12.4
cuda_version="${AS_CUDA_VERSION:-12.4}"
cuda_sm="${AS_CUDA_SM:-80;86;90a}"
cuda_sm="${AS_CUDA_SM:-80;90a}"
NCCL_VERSION="${AS_NCCL_VERSION:-2.23.4}"
build_folder="${AS_BUILD_FOLDER:-build}"
force_conan="${AS_FORCE_CONAN:-OFF}"
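Dropping SM 86 from the default list shrinks the fatbin and the build time while keeping 80 and 90a covered, and the list stays overridable through the environment. For example, to build for Ampere consumer GPUs as well (an illustrative invocation, not a documented workflow):

```bash
# restore SM 86 on top of the new default of "80;90a"
AS_CUDA_SM="80;86;90a" AS_CUDA_VERSION=12.4 AS_PLATFORM=cuda ./build.sh
```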
23 changes: 20 additions & 3 deletions cmake/FindNCCL.cmake
@@ -18,9 +18,26 @@ else()
endif()

message("find nccl with ${NCCL_LIBNAME}")
# Prefer the nccl library with a version number
find_library(
AS_NCCL_LIBRARY ${NCCL_LIBNAME}
PATH_SUFFIXES lib lib64 nccl-${NCCL_VERSION}-cuda-${CUDA_VERSION}/lib64)
AS_NCCL_LIBRARY_VERSIONED
NAMES nccl-${NCCL_VERSION}
PATH_SUFFIXES lib lib64 nccl-${NCCL_VERSION}-cuda-${CUDA_VERSION}/lib64
)

# If not found, fall back to the nccl library without a version number
if(NOT AS_NCCL_LIBRARY_VERSIONED)
message("find nccl without version number, searching ${CUDAToolkit_LIBRARY_DIR}")
find_library(
AS_NCCL_LIBRARY
NAMES nccl
PATHS
${CUDAToolkit_LIBRARY_DIR}
)
else()
message("found nccl with version number")
set(AS_NCCL_LIBRARY ${AS_NCCL_LIBRARY_VERSIONED})
endif()

if(ENABLE_NV_STATIC_LIB)
message("add nccl static lib")
@@ -42,7 +59,7 @@ install(FILES ${NCCL_LIBS}
DESTINATION ${CMAKE_INSTALL_LIBDIR})
endif()


message("find nccl at ${NCCL_INCLUDE_DIR} lib: ${AS_NCCL_LIBRARY}")
find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIR
AS_NCCL_LIBRARY)

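The lookup now tries a versioned name first and only then searches `CUDAToolkit_LIBRARY_DIR` for a plain `nccl`, so systems that ship an unversioned `libnccl.so` alongside the CUDA toolkit still configure. One way to confirm which path the configure step picked up (build directory name illustrative):

```bash
# find_library caches its result, so the cache shows the resolved NCCL path
grep -i nccl build/CMakeCache.txt
```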
4 changes: 3 additions & 1 deletion cmake/flash-attention.cmake
@@ -44,6 +44,8 @@ include(ExternalProject)

message(STATUS "Use flash-attention from external project")
set(FLASH_ATTENTION_GIT_REPO https://github.com/Dao-AILab/flash-attention.git)
# mirror for china.
# set(FLASH_ATTENTION_GIT_REPO https://gitee.com/lanyuflying/flash-attention.git)
set(FLASH_ATTENTION_GIT_TAG 7551202cb2dd245432bc878447e19015c0af3c22)
set(FLASH_ATTENTION_GIT_PATCH ${PROJECT_SOURCE_DIR}/third_party/patch/flash-attn.patch)

@@ -60,7 +62,7 @@ include(ExternalProject)
SOURCE_SUBDIR csrc
DEPENDS project_cutlass
CMAKE_GENERATOR "Ninja"
BUILD_COMMAND ${CMAKE_COMMAND} --build . -j32 -v
BUILD_COMMAND ${CMAKE_COMMAND} --build . -j2 -v
BUILD_BYPRODUCTS ${FLASHATTN_LIBRARY_PATH}/${FLASHATTN_LIBRARY_NAME}
USES_TERMINAL true
CMAKE_CACHE_ARGS
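Cutting the external flash-attention build from `-j32` to `-j2` trades wall-clock time for peak memory, since each nvcc instance compiling these kernels can need several gigabytes of RAM. A RAM-aware middle ground might look like this (the ~10 GB-per-job figure is an assumption, not measured in this repo):

```bash
# scale parallel nvcc jobs to available memory, assuming ~10 GB per job
avail_gb=$(free -g | awk '/^Mem:/{print $7}')
jobs=$(( avail_gb / 10 ))
[ "$jobs" -lt 1 ] && jobs=1   # always allow at least one job
cmake --build . -j"$jobs" -v
```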
2 changes: 1 addition & 1 deletion conan/conanfile.txt
@@ -2,7 +2,7 @@
protobuf/3.18.3
gtest/1.11.0
glog/0.5.0
pybind11/2.8.1
pybind11/2.13.6
zlib/1.2.13
[generators]
cmake
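pybind11 2.8.1 predates CPython 3.12 (support landed in the 2.11 series), which is why the bump to 2.13.6 across all four conanfiles accompanies the Python 3.12 work. A quick way to confirm which recipe conan resolves (illustrative conan 1.x invocation):

```bash
conan install conan/conanfile.txt --build=missing 2>&1 | grep -i pybind11
```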
2 changes: 1 addition & 1 deletion conan/conanfile_arm.txt
@@ -3,7 +3,7 @@
gtest/1.11.0
glog/0.5.0
libunwind/1.7.2
pybind11/2.8.1
pybind11/2.13.6
zlib/1.2.13
[generators]
cmake
2 changes: 1 addition & 1 deletion conan/conanfile_openmpi.txt
@@ -2,7 +2,7 @@
protobuf/3.18.3
gtest/1.11.0
glog/0.5.0
pybind11/2.8.1
pybind11/2.13.6
openmpi/4.1.0
hwloc/2.9.3 # 2.10 only support shared lib, use older version to build static lib.
zlib/1.2.13
2 changes: 1 addition & 1 deletion conan/conanfile_openmpi_arm.txt
@@ -3,7 +3,7 @@
gtest/1.11.0
glog/0.5.0
libunwind/1.7.2
pybind11/2.8.1
pybind11/2.13.6
openmpi/4.1.0
zlib/1.2.13
grpc/1.50.1
1 change: 0 additions & 1 deletion examples/benchmark/requirements.txt
@@ -5,4 +5,3 @@ pandas
tabulate
tqdm
optimum
auto-gptq
5 changes: 5 additions & 0 deletions multimodal/README.md
@@ -8,10 +8,15 @@ DashInfer VLMs is a toolkit to support Vision Language Models (VLMs) inference b

## Supported Models
- Qwen2-VL 2B/7B/72B
- Qwen2.5-VL 2B/7B/72B (only supports the transformers ViT engine)

## Architecture
![alt text](resource/dashinfer-vlm-arch.png)


## Benchmark and Example
- See [tests/README](tests/README.md)

## API Reference
Currently, some [openai chat completion API](https://platform.openai.com/docs/api-reference/chat) parameters are unavailable. Here is the list of parameters supported in dashinfer vlm.

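Because the server speaks the OpenAI chat-completions protocol, a minimal smoke test needs nothing but curl (a sketch assuming a local server on the default port; the model name is illustrative):

```bash
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "qwen-vl", "messages": [{"role": "user", "content": "Hello"}]}'
```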
7 changes: 7 additions & 0 deletions multimodal/run_benchmark.sh
@@ -0,0 +1,7 @@
python tests/benchmark_openai_api.py --prompt-file tests/data/docvqa_train_10k.jsonl --image-folder `pwd`/tests/data/share_textvqa/images/ --req-nums 100 \
--batch-size 32 \
--image-nums-mean 3 \
--image-nums-range 1 \
--response-mean 120 \
--response-len-range 64

8 changes: 5 additions & 3 deletions multimodal/tests/benchmark_openai_api.py
@@ -44,9 +44,9 @@ class BenchRequest:


class OpenAIAPIBenchmark:
def __init__(self) -> None:
def __init__(self, host, port) -> None:
openai_api_key = "EMPTY"
openai_api_base = "http://127.0.0.1:8000/v1"
openai_api_base = f"http://{host}:{port}/v1"

self.client = OpenAI(
api_key=openai_api_key,
@@ -269,6 +269,8 @@ def print_profiling_data(total_timecost):
parser.add_argument("--image-nums-range", type=int, default=1)
parser.add_argument("--frequency", type=float, default=1000)
parser.add_argument("--batch-size", type=int, default=8)
parser.add_argument("--port", type=int, default=8000)
parser.add_argument("--host", type=str, default="localhost")
args = parser.parse_args()

ds = load_dataset("json", data_files=args.prompt_file, split="train")
@@ -298,7 +300,7 @@ def print_profiling_data(total_timecost):
image_list, qa, args.req_nums, args.multi_turn, response_lens, image_nums
)

model = OpenAIAPIBenchmark()
model = OpenAIAPIBenchmark(args.host, args.port)

global_start = time.time()

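With host and port threaded through the constructor, the benchmark can target a remote endpoint instead of the previously hard-coded `127.0.0.1:8000`. An illustrative invocation (address and request count are placeholders):

```bash
python tests/benchmark_openai_api.py \
  --prompt-file tests/data/docvqa_train_10k.jsonl \
  --image-folder "$(pwd)/tests/data/share_textvqa/images/" \
  --host 10.0.0.5 --port 9000 --req-nums 10
```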
3 changes: 3 additions & 0 deletions multimodal/tests/data/download_data.sh
@@ -0,0 +1,3 @@
wget https://huggingface.co/datasets/OpenGVLab/InternVL-Chat-V1-2-SFT-Data/resolve/main/opensource/docvqa_train_10k.jsonl
wget https://huggingface.co/datasets/OpenGVLab/InternVL-Chat-V1-2-SFT-Data/resolve/main/data/share_textvqa.zip
unzip share_textvqa.zip
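Both files come from the InternVL-Chat-V1-2 SFT data release; the zip is assumed to expand under `share_textvqa/`, matching the `--image-folder` path used by run_benchmark.sh. A slightly more defensive variant of the same script (a sketch, same URLs):

```bash
set -euo pipefail
wget -c https://huggingface.co/datasets/OpenGVLab/InternVL-Chat-V1-2-SFT-Data/resolve/main/opensource/docvqa_train_10k.jsonl
wget -c https://huggingface.co/datasets/OpenGVLab/InternVL-Chat-V1-2-SFT-Data/resolve/main/data/share_textvqa.zip
unzip -n share_textvqa.zip   # -n: never overwrite existing files
```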
2 changes: 1 addition & 1 deletion multimodal/tests/test_openai_chat_completion.py
@@ -127,7 +127,7 @@ def main(args, client):
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--host', type=str,
default="0.0.0.0")
default="localhost")
parser.add_argument('--port', type=str,
default="8000")
parser.add_argument('--type', type=str, default="all", choices=["all", "single_image", "multi_images", "video"])
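`0.0.0.0` is a server bind address, not something a client should dial, so `localhost` is the saner client-side default; an explicit override still works. For a server on another machine (address illustrative):

```bash
python multimodal/tests/test_openai_chat_completion.py --host 192.168.1.20 --port 8000
```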
1 change: 0 additions & 1 deletion python/requirements_dev.txt
@@ -13,7 +13,6 @@ peft
deepspeed

optimum
auto-gptq

# xformers
# transformers>=4.32.0
2 changes: 1 addition & 1 deletion python/setup.py
@@ -56,7 +56,7 @@ def build_extension(self, ext):
# 11.4
cuda_version = os.getenv("AS_CUDA_VERSION", "12.4")
nccl_version = os.getenv("AS_NCCL_VERSION", "2.23.4")
cuda_sm = os.getenv("AS_CUDA_SM", "'80;86'")
cuda_sm = os.getenv("AS_CUDA_SM", "'80;90a'")
nv_system_lib = os.getenv("AS_SYSTEM_NV_LIB", "OFF")
config = os.getenv("AS_BUILD_TYPE", "Release")
as_platform = os.getenv("AS_PLATFORM", "cuda")
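The wheel build follows the same `AS_CUDA_SM` convention as build.sh, including the nested quoting in the default value. Restoring SM 86 for a local wheel might look like this (a sketch, not a documented workflow):

```bash
# match build.sh's new default of 80;90a, plus SM 86 (hypothetical override)
AS_CUDA_SM="'80;86;90a'" AS_PLATFORM=cuda AS_CUDA_VERSION=12.4 \
  python setup.py bdist_wheel
```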