1 change: 1 addition & 0 deletions .ci/docker/common/install_conda.sh
@@ -42,6 +42,7 @@ install_pip_dependencies() {
  pip_install -r /opt/conda/requirements-dev.txt
  pip_install -r /opt/conda/requirements.txt
  pip_install -r /opt/conda/requirements-flux.txt
+  pip_install -r /opt/conda/requirements-vlm.txt
  popd
}

3 changes: 2 additions & 1 deletion .ci/docker/ubuntu/Dockerfile
@@ -32,10 +32,11 @@ ENV PATH /opt/conda/envs/py_$PYTHON_VERSION/bin:/opt/conda/bin:$PATH
COPY requirements-dev.txt /opt/conda/
COPY requirements.txt /opt/conda/
COPY requirements-flux.txt /opt/conda/
+COPY requirements-vlm.txt /opt/conda/
COPY conda-env-ci.txt /opt/conda/
COPY ./common/install_conda.sh install_conda.sh
COPY ./common/utils.sh utils.sh
-RUN bash ./install_conda.sh && rm install_conda.sh utils.sh /opt/conda/requirements-dev.txt /opt/conda/requirements.txt /opt/conda/requirements-flux.txt /opt/conda/conda-env-ci.txt
+RUN bash ./install_conda.sh && rm install_conda.sh utils.sh /opt/conda/requirements-dev.txt /opt/conda/requirements.txt /opt/conda/requirements-flux.txt /opt/conda/requirements-vlm.txt /opt/conda/conda-env-ci.txt

USER ci-user
CMD ["bash"]
4 changes: 1 addition & 3 deletions .github/workflows/integration_test_8gpu_flux.yaml
@@ -8,9 +8,7 @@ on:
  pull_request:
    paths:
      - 'torchtitan/experiments/flux/**'
-  schedule:
-    # Runs every 6 hours
-    - cron: '0 */6 * * *'
+
concurrency:
  group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true
1 change: 1 addition & 0 deletions .github/workflows/integration_test_8gpu_models.yaml
@@ -6,6 +6,7 @@ on:
    paths-ignore:
      - 'torchtitan/experiments/**'
  pull_request:
+    branches: [ main ]
    paths-ignore:
      - 'torchtitan/experiments/**'
  schedule:
50 changes: 50 additions & 0 deletions .github/workflows/integration_test_8gpu_vlm.yaml
@@ -0,0 +1,50 @@
name: 8 GPU Vision Language Model Tests

on:
  push:
    branches: [ main ]
    paths:
      - 'torchtitan/experiments/vlm/**'
  pull_request:
    paths:
      - 'torchtitan/experiments/vlm/**'

concurrency:
  group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true

defaults:
  run:
    shell: bash -l -eo pipefail {0}

jobs:
  build-test:
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    with:
      runner: linux.g5.48xlarge.nvidia.gpu
      gpu-arch-type: cuda
      gpu-arch-version: "12.6"
      # This image is faster to clone than the default, but it lacks CC needed by triton
      # (1m25s vs 2m37s).
      docker-image: torchtitan-ubuntu-20.04-clang12
      repository: pytorch/torchtitan
      upload-artifact: outputs
      script: |
        set -eux

        # The generic Linux job chooses to use base env, not the one setup by the image
        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
        conda activate "${CONDA_ENV}"

        # Log CUDA driver version for debugging.
        DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n 1 || true)
        echo "CUDA driver version: ${DRIVER_VERSION}"

        pip config --user set global.progress_bar off

        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126

        USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126

        mkdir artifacts-to-be-uploaded
        python -m torchtitan.experiments.vlm.tests.integration_tests artifacts-to-be-uploaded --ngpu 4
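The script block above doubles as a local reproduction recipe. The commands below are the same ones the workflow runs, assuming a machine with CUDA 12.6 drivers, at least 4 GPUs, and an activated Python environment with the torchtitan requirements installed:

# Local sketch of the CI test step above (same commands as the workflow).
python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126

mkdir artifacts-to-be-uploaded
python -m torchtitan.experiments.vlm.tests.integration_tests artifacts-to-be-uploaded --ngpu 4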
2 changes: 1 addition & 1 deletion tests/integration_tests/run_tests.py
@@ -106,7 +106,7 @@ def main():
        "--test_suite",
        default="features",
        choices=["features", "models", "h100"],
-        help="Which test suite to run. If not specified, torchtitan composibility tests will be run",
+        help="Which test suite to run. If not specified, torchtitan composability tests will be run",
    )
    parser.add_argument(
        "--config_path",
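For orientation, this shared runner is the entry point for the core suites, while the VLM tests added below wrap the same run_tests helper with their own test list. A typical invocation would look like the following; running it as a module from the repo root and the --ngpu flag are assumptions based on how the VLM runner below is built, not something this hunk shows:

# Assumed invocation of the shared runner; the output directory name is arbitrary.
python -m tests.integration_tests.run_tests ./core-test-outputs --test_suite models --ngpu 8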
8 changes: 8 additions & 0 deletions torchtitan/experiments/vlm/datasets/mm_datasets.py
@@ -199,6 +199,14 @@ def _process_cc12_wd_sample(
        loader=lambda path: load_dataset(path, split="train", streaming=True),
        sample_processor=_process_cc12_wd_sample,
    ),
+    "cc12m-test": DatasetConfig(
+        # TODO: move test cc12m dataset to core test folder
+        path="torchtitan/experiments/flux/tests/assets/cc12m_test",
+        loader=lambda path: load_dataset(
+            path, split="train", data_files={"train": "*.tar"}, streaming=True
+        ),
+        sample_processor=_process_cc12_wd_sample,
+    ),
}


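The new "cc12m-test" entry reuses the small cc12m assets bundled with the flux tests. A quick way to sanity-check that those assets load is to call load_dataset exactly as the entry's loader does; this sketch assumes the repo root as the working directory and the Hugging Face datasets package installed:

# Smoke-test the bundled cc12m_test webdataset shards with the same loader call
# used by the "cc12m-test" DatasetConfig above.
python - <<'PY'
from datasets import load_dataset

ds = load_dataset(
    "torchtitan/experiments/flux/tests/assets/cc12m_test",
    split="train",
    data_files={"train": "*.tar"},
    streaming=True,
)
print(sorted(next(iter(ds)).keys()))
PY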
1 change: 1 addition & 0 deletions torchtitan/experiments/vlm/requirements-vlm.txt
71 changes: 71 additions & 0 deletions torchtitan/experiments/vlm/tests/integration_tests.py
@@ -0,0 +1,71 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import argparse
import os

from tests.integration_tests import OverrideDefinitions
from tests.integration_tests.run_tests import run_tests


def build_vlm_test_list() -> list[OverrideDefinitions]:
    """
    Returns a list of OverrideDefinitions used to generate variations of
    integration tests based on the same root config file.
    """
    integration_tests_flavors = [
        OverrideDefinitions(
            [
                [
                    "--experimental.custom_args_module torchtitan.experiments.vlm.assets.job_config",
                    "--model.name vlm",
                    "--training.dataset cc12m-test",
                    "--parallelism.data_parallel_shard_degree 4",
                    "--data.max_patches_per_image 1024",
                    "--data.max_images_per_batch 64",
                ],
            ],
            "VLM FSDP",
            "vlm_fsdp",
            ngpu=4,
        ),
    ]
    return integration_tests_flavors


_TEST_SUITES_FUNCTION = {
    "vlm": build_vlm_test_list,
}


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("output_dir")
    parser.add_argument(
        "--config_path",
        default="./tests/integration_tests/base_config.toml",
        help="Base config path for integration tests. This is the config that will be used as a base for all tests.",
    )
    parser.add_argument(
        "--test_name",
        default="all",
        help="test to run, acceptable values: `test_name` in `build_vlm_test_list` (default: all)",
    )
    parser.add_argument("--ngpu", default=8, type=int)
    args = parser.parse_args()

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    if os.listdir(args.output_dir):
        raise RuntimeError("Please provide an empty output directory.")

    test_list = _TEST_SUITES_FUNCTION["vlm"]()
    run_tests(args, test_list)


if __name__ == "__main__":
    main()
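The runner is invoked by the new workflow with a fresh artifacts directory and --ngpu 4. To run a single flavor locally, the --test_name filter can be used; that it matches on the OverrideDefinitions name ("vlm_fsdp" above) is an assumption based on the help string and the shared run_tests behavior:

# Run only the FSDP flavor; the output directory must be new or empty
# (see the checks in main() above).
python -m torchtitan.experiments.vlm.tests.integration_tests ./vlm-test-outputs --test_name vlm_fsdp --ngpu 4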