Commit f5d3d93

[docker] Build CUDA kernels in separate Docker stage for faster rebuilds (#29452)
Signed-off-by: Amr Mahdi <[email protected]>
1 parent 78f4bb0 commit f5d3d93

4 files changed, +74 -11 lines

docker/Dockerfile

Lines changed: 58 additions & 8 deletions
@@ -150,8 +150,8 @@ ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0 12.0'
 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
 #################### BASE BUILD IMAGE ####################

-#################### WHEEL BUILD IMAGE ####################
-FROM base AS build
+#################### CSRC BUILD IMAGE ####################
+FROM base AS csrc-build
 ARG TARGETPLATFORM

 ARG PIP_INDEX_URL UV_INDEX_URL
@@ -172,10 +172,13 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
         --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

-COPY . .
-ARG GIT_REPO_CHECK=0
-RUN --mount=type=bind,source=.git,target=.git \
-    if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi
+WORKDIR /workspace
+
+COPY pyproject.toml setup.py CMakeLists.txt ./
+COPY cmake cmake/
+COPY csrc csrc/
+COPY vllm/envs.py vllm/envs.py
+COPY vllm/__init__.py vllm/__init__.py

 # max jobs used by Ninja to build extensions
 ARG max_jobs=2
@@ -195,9 +198,11 @@ ARG SCCACHE_S3_NO_CREDENTIALS=0
 ARG VLLM_USE_PRECOMPILED=""
 ARG VLLM_MAIN_CUDA_VERSION=""

+# Use dummy version for csrc-build wheel (only .so files are extracted, version doesn't matter)
+ENV SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0+csrc.build"
+
 # if USE_SCCACHE is set, use sccache to speed up compilation
 RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,source=.git,target=.git \
     if [ "$USE_SCCACHE" = "1" ]; then \
         echo "Installing sccache..." \
         && curl -L -o sccache.tar.gz ${SCCACHE_DOWNLOAD_URL} \
@@ -223,7 +228,6 @@ ENV VLLM_TARGET_DEVICE=${vllm_target_device}
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
     --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,source=.git,target=.git \
     if [ "$USE_SCCACHE" != "1" ]; then \
         # Clean any existing CMake artifacts
         rm -rf .deps && \
@@ -232,6 +236,52 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
         export VLLM_DOCKER_BUILD_CONTEXT=1 && \
         python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
     fi
+#################### CSRC BUILD IMAGE ####################
+
+#################### WHEEL BUILD IMAGE ####################
+FROM base AS build
+ARG TARGETPLATFORM
+
+ARG PIP_INDEX_URL UV_INDEX_URL
+ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
+ARG PYTORCH_CUDA_INDEX_BASE_URL
+
+# install build dependencies
+COPY requirements/build.txt requirements/build.txt
+
+# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
+# Reference: https://github.com/astral-sh/uv/pull/1694
+ENV UV_HTTP_TIMEOUT=500
+ENV UV_INDEX_STRATEGY="unsafe-best-match"
+# Use copy mode to avoid hardlink failures with Docker cache mounts
+ENV UV_LINK_MODE=copy
+
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
+        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+
+WORKDIR /workspace
+
+COPY --from=csrc-build /workspace/dist /precompiled-wheels
+
+COPY . .
+
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+    if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi
+
+ARG vllm_target_device="cuda"
+ENV VLLM_TARGET_DEVICE=${vllm_target_device}
+
+# Skip adding +precompiled suffix to version (preserves git-derived version)
+ENV VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX=1
+
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,source=.git,target=.git \
+    if [ "${vllm_target_device}" = "cuda" ]; then \
+        export VLLM_PRECOMPILED_WHEEL_LOCATION=$(ls /precompiled-wheels/*.whl); \
+    fi && \
+    python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38

 # Install DeepGEMM from source
 ARG DEEPGEMM_GIT_REF
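Net effect of the Dockerfile change: the csrc-build stage compiles the CUDA extensions into a throwaway wheel under a pinned dummy version, and the build stage copies that wheel in as /precompiled-wheels and points setup.py at it, so editing Python-only files no longer invalidates the kernel compilation layer. Below is a minimal Python sketch of that hand-off, assuming the stage layout above; build_final_wheel and its default path are illustrative only, not code from the repo.

# Hypothetical sketch (not part of the repo): mirrors the shell logic in the
# "build" stage that points setup.py at the wheel produced by csrc-build.
import glob
import os
import subprocess


def build_final_wheel(precompiled_dir: str = "/precompiled-wheels") -> None:
    wheels = glob.glob(os.path.join(precompiled_dir, "*.whl"))
    env = os.environ.copy()
    if env.get("VLLM_TARGET_DEVICE", "cuda") == "cuda" and wheels:
        # Reuse the .so files from the kernel-only wheel instead of recompiling.
        env["VLLM_PRECOMPILED_WHEEL_LOCATION"] = wheels[0]
        # Keep the git-derived version; don't append the "+precompiled" marker.
        env["VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX"] = "1"
    subprocess.run(
        ["python3", "setup.py", "bdist_wheel", "--dist-dir=dist", "--py-limited-api=cp38"],
        check=True,
        env=env,
    )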

setup.py

Lines changed: 11 additions & 3 deletions
@@ -461,14 +461,22 @@ def extract_precompiled_and_patch_package(
         "vllm/cumem_allocator.abi3.so",
     ]

-    compiled_regex = re.compile(
+    flash_attn_regex = re.compile(
         r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py"
     )
+    triton_kernels_regex = re.compile(
+        r"vllm/third_party/triton_kernels/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py"
+    )
     file_members = list(
         filter(lambda x: x.filename in files_to_copy, wheel.filelist)
     )
     file_members += list(
-        filter(lambda x: compiled_regex.match(x.filename), wheel.filelist)
+        filter(lambda x: flash_attn_regex.match(x.filename), wheel.filelist)
+    )
+    file_members += list(
+        filter(
+            lambda x: triton_kernels_regex.match(x.filename), wheel.filelist
+        )
     )

     for file in file_members:
@@ -648,7 +656,7 @@ def get_vllm_version() -> str:
     if envs.VLLM_TARGET_DEVICE == "empty":
         version += f"{sep}empty"
     elif _is_cuda():
-        if envs.VLLM_USE_PRECOMPILED:
+        if envs.VLLM_USE_PRECOMPILED and not envs.VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX:
             version += f"{sep}precompiled"
         else:
             cuda_version = str(get_nvcc_cuda_version())
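In setup.py, the single compiled_regex is split into flash_attn_regex and triton_kernels_regex so the repackaging step also carries over the pure-Python files vendored under vllm/third_party/triton_kernels; both patterns accept non-hidden .py files at any depth below their respective trees. A small sketch of the selection behaviour follows, with made-up file names; it is not part of setup.py.

# Hypothetical sketch (not from setup.py): shows which wheel entries the two
# patterns used by extract_precompiled_and_patch_package would select.
import re

flash_attn_regex = re.compile(r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")
triton_kernels_regex = re.compile(r"vllm/third_party/triton_kernels/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")

candidates = [
    "vllm/vllm_flash_attn/flash_attn_interface.py",   # matched by flash_attn_regex
    "vllm/third_party/triton_kernels/example_kernel.py",  # matched by triton_kernels_regex
    "vllm/vllm_flash_attn/.hidden/skipped.py",        # hidden directory -> no match
    "vllm/attention/backends/flash_attn.py",          # outside both trees -> no match
]

for name in candidates:
    picked = bool(flash_attn_regex.match(name) or triton_kernels_regex.match(name))
    print(f"{name}: {'copied' if picked else 'ignored'}")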

vllm/envs.py

Lines changed: 5 additions & 0 deletions
@@ -78,6 +78,7 @@
     MAX_JOBS: str | None = None
     NVCC_THREADS: str | None = None
     VLLM_USE_PRECOMPILED: bool = False
+    VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX: bool = False
     VLLM_DOCKER_BUILD_CONTEXT: bool = False
     VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False
     VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False
@@ -462,6 +463,10 @@ def get_vllm_port() -> int | None:
     .lower()
     in ("1", "true")
     or bool(os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")),
+    # If set, skip adding +precompiled suffix to version string
+    "VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX": lambda: bool(
+        int(os.environ.get("VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX", "0"))
+    ),
     # Used to mark that setup.py is running in a Docker build context,
     # in order to force the use of precompiled binaries.
     "VLLM_DOCKER_BUILD_CONTEXT": lambda: os.environ.get("VLLM_DOCKER_BUILD_CONTEXT", "")
