Changes from all commits · 129 commits
cbccb38
[benchmark] add h200 bench (#1361)
asaiacai Jul 12, 2025
05e47c3
fixing dtype in flux eval (#1388)
kfirgoldberg Jul 13, 2025
2764a77
[float8] Fix module filter function (#1391)
danielvegamyhre Jul 14, 2025
7f5c3b6
added badges for pip and conda, and explicit installation instruction…
sarthakpati Jul 14, 2025
8908970
fix wrong b200 flops number (#1393)
samsja Jul 14, 2025
6204cdf
refactor ParallelDims and CheckpointManager (#1384)
tianyu-l Jul 14, 2025
db52d57
Add support for saving HF format tensors with DCP (#1351)
ankitageorge Jul 14, 2025
27e3ad8
Add Github workflow to build and publish wheel to PyTorch Index night…
joecummings Jul 15, 2025
53f6642
Validator integration with current metrics processor for logging (#1395)
wesleytruong Jul 15, 2025
23b8736
refactor FTManager (#1397)
tianyu-l Jul 15, 2025
c1c55ea
Lint (#1400)
H-Huang Jul 15, 2025
2906d8b
[DSV3] Add PP support for DSV3 (#1345)
H-Huang Jul 15, 2025
972ac9f
Add the missing field to NoColor (#1406)
fegin Jul 16, 2025
9a8cb98
Add option for selective op AC to filter mm shapes based on fqn (#1380)
soulitzer Jul 16, 2025
f062d48
[llama4] Change expert_bias and tokens_per_expert to non-persistent b…
wwwjn Jul 16, 2025
d69a737
create multipe outer optimizers for diloco (#1407)
tushar00jain Jul 16, 2025
4e5265e
[DSV3] Change sdpa interface to pass softmax_scale (#1394)
wwwjn Jul 18, 2025
c004dc4
separate outputs for ft replicas (#1410)
tushar00jain Jul 18, 2025
c924c44
allow specifying ft pg (#1411)
tushar00jain Jul 18, 2025
183f6fc
Remove flex+sac restriction (#1408)
drisspg Jul 20, 2025
beb29a1
add infra support for HF checkpoint conversion (#1404)
tianyu-l Jul 20, 2025
93a236c
[DSV3] Add normalization for topk router scores (#1419)
wwwjn Jul 21, 2025
4e73af3
update documentation for release (#1425)
tianyu-l Jul 21, 2025
415f834
Add Flex debug config (#1437)
drisspg Jul 22, 2025
ad7f644
[FT] Add torchft to CI (#1398)
H-Huang Jul 22, 2025
16273f8
[deepseek] fix FlexAttention + TP (#1440)
tianyu-l Jul 22, 2025
177b295
refactor so that ModelArgs does not depend on tokenizer (#1424)
tianyu-l Jul 23, 2025
171a883
Take job config out of checkpoint manager (#1433)
ebsmothers Jul 23, 2025
34d815c
[refactor] split JobConfig and ConfigManager into two files (#1442)
tianyu-l Jul 23, 2025
2e6ab37
add the forge folder (#1387)
tianyu-l Jul 23, 2025
2f1c814
add back torch nightly install instruction (#1444)
tianyu-l Jul 23, 2025
d282cf2
[refactor] moving gloabl dependence on JobConfig to fine-grained conf…
tianyu-l Jul 24, 2025
70592cb
added model definition conversion for llama3 (#1441)
wesleytruong Jul 24, 2025
58afc5f
Fix incorrect mapping of ffn_norm and attention_norm in HF Llama4 con…
raymin0223 Jul 24, 2025
38a9d30
publish instructions on adding a new model (#1451)
tianyu-l Jul 25, 2025
f3e2a75
make mxfp8 dim1 cast kernel configurable (#1427)
danielvegamyhre Jul 25, 2025
8a7b4aa
Fix a none pointer exception in checkpoint.py (#1465)
DNXie Jul 28, 2025
1fefaee
remove float8 force_recompute_fp8_weight_in_bwd flag (#1452)
vkuzo Jul 28, 2025
a44dff1
[checkpoint] let user specify `intial_load_path` and `initial_load_in…
tianyu-l Jul 28, 2025
f26179e
Re-enable pipeline parallel tests (#1477)
H-Huang Jul 29, 2025
83e6941
Improve reshard_after_forward logic (#1094)
tianyu-l Jul 29, 2025
942661c
Log total number of tokens seen (#1474)
runame Jul 29, 2025
5bab356
Temporarily Disable Memory Tracking Test for FSDP2 (#1480)
wwwjn Jul 29, 2025
8dd5a7e
Fix tokenizer error message (#1476)
H-Huang Jul 29, 2025
3aa09b9
log cuda driver version for debugging (#1479)
danielvegamyhre Jul 29, 2025
327a99c
Fixes the sd adapter in forge experiments (#1484)
allenwang28 Jul 29, 2025
881f0ca
Change `lr_min` to `min_lr_factor ` (#1471)
unlimblue Jul 30, 2025
f1c8c2c
guard against nvidia-smi command exit code 1 (#1496)
danielvegamyhre Jul 30, 2025
3c84ce0
Refactor PP splitting (#1416)
H-Huang Jul 30, 2025
be49c02
[deepseek] integrate 16B tokenizer to match 16B official model (#1497)
lessw2020 Jul 31, 2025
82b593e
remove dead code (#1501)
tushar00jain Jul 31, 2025
5961c75
fix creating leaf folder (#1502)
tushar00jain Jul 31, 2025
1080c8f
validation support for pipeline parallelism [WIP] (#1490)
wesleytruong Jul 31, 2025
ad9849c
Fix data_load_start position (#1481)
speed1313 Jul 31, 2025
b1dc330
Refactor script to use 'overwrites' variable for command-line argumen…
idoh Jul 31, 2025
cf30b29
Add logging for learning rates in MetricsProcessor (#1413)
idoh Jul 31, 2025
d655e16
Make token group alignment size configurable (#1503)
danielvegamyhre Aug 1, 2025
b109f7d
[DSV3] Add output.contiguous() in model to match llama3 (#1504) (#1513)
XilunWu Aug 1, 2025
43fa980
fix small deepseekv3 typo (#1514)
ruisizhang123 Aug 1, 2025
48d8dcd
make mx recipe name more generic (#1512)
vkuzo Aug 1, 2025
a0fdaa3
All-reduce `ntokens_seen` before logging (#1509)
runame Aug 1, 2025
2429e0b
Compute validation metrics at first step (#1508)
runame Aug 1, 2025
004162a
minor fix (#1494)
ShoufaChen Aug 2, 2025
ed288bc
[llama4] store expert weights such that we can transpose before group…
danielvegamyhre Aug 3, 2025
2844029
[llama4] add apply_compile for moe, where fullgraph=False for moe lay…
danielvegamyhre Aug 4, 2025
92bea07
[deepseek] update to 16b base tokenizer (#1499)
lessw2020 Aug 5, 2025
90cfba4
Add description for 16B model tokenizer for deepseek-v3 model (#1530)
wwwjn Aug 5, 2025
a204e31
Flux Validation (#1518)
wesleytruong Aug 5, 2025
3065a2a
model fragments for diloco (#1446)
tushar00jain Aug 6, 2025
cc55827
checkpoint.md (#1533)
wesleytruong Aug 6, 2025
f2830b6
Fix config manager directories (#1532)
AlirezaShamsoshoara Aug 6, 2025
a9aa506
unify moe implementation for llama4 and deepseek_v3 (#1534)
tianyu-l Aug 6, 2025
be211c8
separate out diloco configs (#1516)
tushar00jain Aug 6, 2025
36ec547
fix module import (#1537)
tushar00jain Aug 6, 2025
a1fdd7e
use logger in ft (#1539)
tushar00jain Aug 6, 2025
23e4dfc
fix: ep clipping with no ep grads (#1541)
garrett361 Aug 8, 2025
2c8b594
Reorder validate and checkpoint in train (#1542)
wesleytruong Aug 8, 2025
59e57a4
fix EP fsdp gradient divide factor (#1551)
tianyu-l Aug 11, 2025
fd5a87f
Better Support for Huggingface Asset Integration (#1526)
wesleytruong Aug 12, 2025
d14f1e3
Flux Batched Inference (#1548)
wesleytruong Aug 12, 2025
9c42b9b
[a2av] Add autograd support for token dispatch op (#1491)
kwen2501 Aug 12, 2025
cf4de26
[a2av] Add autograd support for token combine op (#1511)
kwen2501 Aug 12, 2025
a6972ae
Add state_dict converter for DeepSeekv3 in torchtitan (#1538)
wwwjn Aug 12, 2025
8bd8c93
Move fqn mapping logic to StateDictAdapter (#1557)
wesleytruong Aug 12, 2025
21416c4
Update .gitignore (#1560)
wesleytruong Aug 13, 2025
0c51d92
fix state dict adapter in forge engine (#1563)
ebsmothers Aug 13, 2025
48b6520
unit test for download_hf_assets script (#1556)
wesleytruong Aug 13, 2025
aeb3a4b
[EP] add support for ETP=1 (#1555)
tianyu-l Aug 13, 2025
6377dce
llama4: Avoid staticmethod nested graph break for MoE compile (#1565)
xmfan Aug 13, 2025
7354848
[MoE/EP] apply dim-1 FSDP sharding for routed experts and rewrite sha…
tianyu-l Aug 14, 2025
6fc499f
quick fix dsv3 fsdp (#1575)
tianyu-l Aug 15, 2025
e629fe5
Use PYTORCH_ALLOC_CONF as PYTORCH_CUDA_ALLOC_CONF is deprecated (#1577)
fegin Aug 15, 2025
803906b
Add DualPipeV (#1571)
H-Huang Aug 15, 2025
297a72a
Ignore tokenizer_path if it is an empty string (#1579)
fegin Aug 15, 2025
a59abea
added better guidance for if deprecated tokenizer path fails (#1568)
wesleytruong Aug 15, 2025
72b16b1
Added doc for Val/Eval and lm_eval integration (#1573)
wesleytruong Aug 17, 2025
9233d83
[EP] bug fixes (#1586)
tianyu-l Aug 18, 2025
0d1b80d
[EP] remove token split overhead from DTensor in TokenReorderer pre h…
tianyu-l Aug 18, 2025
f9e8897
Adding Qwen3 model to the experiments folder (#1429)
HosseinKaviani-H Aug 18, 2025
e4847c8
added example for bidirectional checkpoint testing (#1540)
wesleytruong Aug 19, 2025
a54725c
MoE explicit prefetching in FSDP (#1594)
tianyu-l Aug 19, 2025
9e24689
[DeepSeek] add torch.compile + async TP (#1588)
tianyu-l Aug 19, 2025
7f1fa48
[Qwen3] Switch to verified RoPE implementation + Add weight tying sup…
wwwjn Aug 19, 2025
9f47ceb
[dsv3] Remove dtype to avoid confusion (#1599)
wwwjn Aug 19, 2025
b5b7ffb
[HF] Deprecate `tokenizer_path` in Toml Files (#1592)
wesleytruong Aug 19, 2025
084d307
[doc] update DeepSeekV3ModelArgs doc string (#1598)
lckr Aug 19, 2025
9874e84
Change freq_cis from persistent buffer to non-persistent buffer (#1600)
wwwjn Aug 19, 2025
c0b2e5a
[HF] Model Definition Conversion Support for FLUX (#1582)
wesleytruong Aug 20, 2025
46a32e7
Deprecate Llama Conversion Script (#1603)
wesleytruong Aug 20, 2025
08b8b24
[refactor] support compile model and loss separately (#1608)
tianyu-l Aug 21, 2025
82d6c3b
[DSV3] Upgrade to DeepSeek-V3.1 (#1609)
wwwjn Aug 21, 2025
fd23080
Fix Typo (#1611)
wwwjn Aug 21, 2025
2bfcdd8
improve MoE bias update logic in optimizer (#1593)
rakkit Aug 22, 2025
255a6ab
fix qwen3 compile config in parallelize.py (#1623)
YangWang92 Aug 22, 2025
7d744b2
add model_parts ref to MetricsProcessor (#1578)
garrett361 Aug 22, 2025
8a749c6
Move the call to init_attention_mask to trainer (#1616)
fegin Aug 22, 2025
f738a03
Switch DeepSeekV3 to Use FlexAttention by Default (#1610)
fegin Aug 22, 2025
cab22e7
Centralize Async TP Enablement with maybe_enable_async_tp API (#1619)
fegin Aug 22, 2025
cd337db
[Cleanup] Miscellaneous Refactors (#1607)
wesleytruong Aug 22, 2025
2025abb
async tp minor fix (#1629)
tianyu-l Aug 25, 2025
9197908
fix(dataloader): Prevent RuntimeError from DataloaderStopIteration (#…
Lain810 Aug 25, 2025
030879f
[Qwen3] Fix weight tying for Qwen3 according to Huggingface configs (…
wwwjn Aug 25, 2025
ad06609
Fix variable name in NotImplementedError message (#1637)
wesleytruong Aug 25, 2025
4191def
Update torchft.md (#1596)
H-Huang Aug 26, 2025
e65ef30
Adding StateDictAdapter (#1601)
HosseinKaviani-H Aug 26, 2025
17ef753
add wandb team entity and run name options (#1643)
anana10c Aug 27, 2025
a481c26
Solving the validation hanging issue (#1634)
DNXie Aug 27, 2025
7156416
update warning message (#1648)
DNXie Aug 28, 2025
45647b3
Merge branch 'main' into whc/merge_autoparallel
wconstab Aug 28, 2025
1 change: 1 addition & 0 deletions .ci/docker/requirements.txt
@@ -7,3 +7,4 @@ wandb
fsspec
tyro
tokenizers >= 0.15.0
safetensors
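The new `safetensors` dependency backs the HF-format checkpoint saving added in #1351 above. A minimal, illustrative round-trip through the `safetensors` Python API — a sketch only, assuming `torch` and `safetensors` are installed:

```sh
python - <<'EOF'
# Write a dict of tensors to the safetensors format and read it back.
import torch
from safetensors.torch import load_file, save_file

save_file({"w": torch.ones(2, 2)}, "demo.safetensors")
print(load_file("demo.safetensors")["w"])
EOF
```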
11 changes: 11 additions & 0 deletions .github/scripts/update_version.sh
@@ -0,0 +1,11 @@
version_file="assets/version.txt"
init_file="torchtitan/__init__.py"
if [[ -n "$BUILD_VERSION" ]]; then
# Update the version in version.txt
echo "$BUILD_VERSION" > "$version_file"
# Create a variable named __version__ at the end of __init__.py
echo "__version__ = \"$BUILD_VERSION\"" >> "$init_file"
else
echo "Error: BUILD_VERSION environment variable is not set or empty."
exit 1
fi
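A quick way to exercise this pre-script locally, from the repo root (hypothetical version string):

```sh
BUILD_VERSION=0.1.0.dev20250715 bash .github/scripts/update_version.sh
cat assets/version.txt              # -> 0.1.0.dev20250715
tail -n 1 torchtitan/__init__.py    # -> __version__ = "0.1.0.dev20250715"
```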
40 changes: 40 additions & 0 deletions .github/workflows/build_whl_and_publish.yaml
@@ -0,0 +1,40 @@
name: Build nightly wheels and publish to PyTorch Index

on:
push:
branches:
- nightly
workflow_dispatch:

permissions:
id-token: write
contents: read

jobs:
generate-matrix:
if: github.repository_owner == 'pytorch'
uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main
with:
package-type: wheel
os: linux
test-infra-repository: pytorch/test-infra
test-infra-ref: main
with-cuda: enable
with-rocm: enable
python-versions: '["3.10", "3.11", "3.12"]'
build:
needs: generate-matrix
name: ${{ matrix.repository }}
uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@main
strategy:
fail-fast: false
with:
repository: pytorch/torchtitan
ref: ""
test-infra-repository: pytorch/test-infra
test-infra-ref: main
package-name: torchtitan
build-matrix: ${{ needs.generate-matrix.outputs.matrix }}
pre-script: .github/scripts/update_version.sh
trigger-event: ${{ github.event_name }}
build-platform: 'python-build-package'
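Besides pushes to the `nightly` branch, the `workflow_dispatch` trigger allows kicking off a build manually — for example with the GitHub CLI, assuming `gh` is installed and authenticated:

```sh
gh workflow run build_whl_and_publish.yaml --ref nightly
```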
6 changes: 5 additions & 1 deletion .github/workflows/integration_test_8gpu.yaml
@@ -39,11 +39,15 @@ jobs:
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
conda activate "${CONDA_ENV}"

# Log CUDA driver version for debugging.
DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n 1 || true)
echo "CUDA driver version: ${DRIVER_VERSION}"

pip config --user set global.progress_bar off

python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126

USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126

mkdir artifacts-to-be-uploaded
python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 8
python -m tests.integration_tests artifacts-to-be-uploaded --ngpu 8
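The `|| true` in the driver-logging step is deliberate (see #1496 in the commit list above): CI job shells typically run with `-e` and `pipefail`, so a non-zero nvidia-smi exit would otherwise abort the whole script. A minimal sketch of the failure mode the guard prevents:

```sh
set -eo pipefail
# Without `|| true`, a failing nvidia-smi (e.g. driver not yet initialized)
# would kill the script at this assignment.
DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n 1 || true)
echo "CUDA driver version: ${DRIVER_VERSION:-unavailable}"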
4 changes: 4 additions & 0 deletions .github/workflows/integration_test_8gpu_flux.yaml
@@ -41,6 +41,10 @@ jobs:

pip config --user set global.progress_bar off

# Log CUDA driver version for debugging.
DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n 1 || true)
echo "CUDA driver version: ${DRIVER_VERSION}"

python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126

mkdir artifacts-to-be-uploaded
8 changes: 7 additions & 1 deletion .github/workflows/integration_test_8gpu_h100.yaml
@@ -40,11 +40,17 @@ jobs:
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
conda activate "${CONDA_ENV}"

# Log CUDA driver version for debugging.
DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n 1 || true)
echo "CUDA driver version: ${DRIVER_VERSION}"

pip config --user set global.progress_bar off

python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126

USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126

mkdir artifacts-to-be-uploaded
python ./tests/integration_tests_h100.py artifacts-to-be-uploaded --ngpu 8

# Enable CPP stacktraces for debugging symmetric memory initialization errors.
TORCH_SHOW_CPP_STACKTRACES=1 python -m tests.integration_tests_h100 artifacts-to-be-uploaded --ngpu 8
4 changes: 4 additions & 0 deletions .github/workflows/integration_test_8gpu_simple_fsdp.yaml
@@ -38,6 +38,10 @@ jobs:
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
conda activate "${CONDA_ENV}"

# Log CUDA driver version for debugging.
DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n 1 || true)
echo "CUDA driver version: ${DRIVER_VERSION}"

pip config --user set global.progress_bar off

python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
57 changes: 57 additions & 0 deletions .github/workflows/integration_test_8gpu_torchft.yaml
@@ -0,0 +1,57 @@
name: TorchFT 8 GPU Integration Test

on:
push:
branches: [ main ]
paths:
- 'torchtitan/components/ft.py'
pull_request:
paths:
- 'torchtitan/components/ft.py'
schedule:
# Runs every 6 hours
- cron: '0 */6 * * *'
concurrency:
group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
cancel-in-progress: true

defaults:
run:
shell: bash -l -eo pipefail {0}

jobs:
build-test:
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
with:
runner: linux.g5.48xlarge.nvidia.gpu
gpu-arch-type: cuda
gpu-arch-version: "12.6"
# This image is faster to clone than the default, but it lacks CC needed by triton
# (1m25s vs 2m37s).
docker-image: torchtitan-ubuntu-20.04-clang12
repository: pytorch/torchtitan
upload-artifact: outputs
script: |
set -eux

# The generic Linux job chooses to use base env, not the one setup by the image
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
conda activate "${CONDA_ENV}"

# Log CUDA driver version for debugging.
DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n 1 || true)
echo "CUDA driver version: ${DRIVER_VERSION}"

pip config --user set global.progress_bar off

python -m pip install torchft-nightly
python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126

mkdir artifacts-to-be-uploaded
echo "torchft_lighthouse"
RUST_BACKTRACE=1 torchft_lighthouse --min_replicas 1 --quorum_tick_ms 100 --join_timeout_ms 10000 > /dev/null 2>&1 &
echo "ft_integration_test"
# Getting error - Cuda failure 217 'peer access is not supported between these two devices'
python -m tests.integration_tests_ft artifacts-to-be-uploaded --ngpu 8
# pkill -9 torchft_lighthouse
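The same flow can be reproduced outside CI — a rough local sketch, assuming `torchft-nightly` is installed and 8 GPUs are visible:

```sh
# Start the lighthouse in the background, run the FT integration tests, then clean up.
RUST_BACKTRACE=1 torchft_lighthouse --min_replicas 1 --quorum_tick_ms 100 --join_timeout_ms 10000 &
LIGHTHOUSE_PID=$!
python -m tests.integration_tests_ft ./outputs --ngpu 8
kill "$LIGHTHOUSE_PID"
```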
7 changes: 3 additions & 4 deletions .gitignore
@@ -15,10 +15,9 @@ wandb

torchtitan/datasets/**/*.model

# tokenizer models
assets/**/*.model
assets/**/*.json
assets/**/*.txt
# hf assets
assets/hf/*
assets/tokenizer/*
torchtitan/experiments/flux/assets/*

# temp files
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -51,7 +51,7 @@ Note: To accelerate contributions to and innovations around `torchtitan`, we are
- After the model change, it should still load the original checkpoint correctly.
- Document the reasons for the code change, similar to [composability.md](docs/composability.md).
- Keep code modularized, especially for [train.py](train.py), so that it remains easy to copy-paste into a minimal code example. If necessary:
- Introduce new config options/category in [config_manager.py](torchtitan/config_manager.py).
- Introduce new config options/category in [job_config.py](torchtitan/config/job_config.py).
- Create separate functions/files.

### Proof of Value
40 changes: 34 additions & 6 deletions README.md
@@ -6,9 +6,12 @@

[![integration tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu.yaml?query=branch%3Amain)
[![arXiv](https://img.shields.io/badge/arXiv-2410.06511-b31b1b.svg)](https://arxiv.org/abs/2410.06511)
[![ICLR](https://img.shields.io/badge/ICLR-2025-blue.svg)](https://iclr.cc/virtual/2025/poster/29620)
[![ICLR](https://img.shields.io/badge/ICLR-2025-violet.svg)](https://iclr.cc/virtual/2025/poster/29620)
[![forum](https://img.shields.io/badge/pytorch-forum-DE3412.svg)](https://discuss.pytorch.org/c/distributed/torchtitan/44)
[![license](https://img.shields.io/badge/license-BSD_3--Clause-lightgrey.svg)](./LICENSE)
[![pip](https://img.shields.io/pypi/v/torchtitan?color=blue)](https://pypi.org/project/torchtitan/)
[![conda](https://img.shields.io/conda/vn/conda-forge/torchtitan?color=green)](https://anaconda.org/conda-forge/torchtitan)


</div>

@@ -17,6 +20,8 @@ To use the latest features of `torchtitan`, we recommend using the most recent P


## Latest News
- [2025/07] We published [instructions](/torchtitan/models/README.md) on how to add a model to `torchtitan`.
- [2025/07] We released `torchtitan` [v0.1.0](https://github.com/pytorch/torchtitan/releases), and also set up nightly builds.
- [2025/04] Our paper was accepted by [ICLR 2025](https://iclr.cc/virtual/2025/poster/29620).
- [2025/04] [Llama 4](torchtitan/experiments/llama4/) initial support is available as an experiment.
- [2025/04] Training the diffusion model [FLUX](torchtitan/experiments/flux/) with FSDP/HSDP is available as an experiment.
@@ -33,7 +38,7 @@ To use the latest features of `torchtitan`, we recommend using the most recent P

Our mission is to accelerate innovation in the field of generative AI by empowering researchers and developers to explore new modeling architectures and infrastructure techniques.

The guiding principles when building `torchtitan`
The Guiding Principles when building `torchtitan`
* Designed to be easy to understand, use and extend for different training purposes.
* Minimal changes to the model code when applying multi-dimensional parallelism.
* Bias towards a clean, minimal codebase while providing basic reusable / swappable components.
@@ -86,25 +91,48 @@ You may want to see how the model is defined or how parallelism techniques are a

## Installation

One can choose to install `torchtitan` from a stable release, a nightly build, or directly run the source code. Please [install PyTorch](https://pytorch.org/get-started/locally/) before proceeding.

### Stable releases
One can install the latest [stable release](https://github.com/pytorch/torchtitan/releases) of `torchtitan` via `pip` or `conda`.
```sh
pip install torchtitan
```
```sh
conda install conda-forge::torchtitan
```
Note that each stable release pins the nightly versions of `torch` and `torchao`. Please see [release.md](docs/release.md) for more details.

### Nightly builds

This method requires the nightly build of PyTorch. You can replace `cu126` with a different CUDA version (e.g. `cu128`) or an AMD ROCm build (e.g. `rocm6.3`).

```sh
pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126 --force-reinstall
pip install --pre torchtitan --index-url https://download.pytorch.org/whl/nightly/cu126
```
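A quick sanity check for wheel installs — the packaging pre-script above appends a `__version__` attribute to `torchtitan/__init__.py` at build time:

```sh
python -c "import torchtitan; print(torchtitan.__version__)"
```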

### From source

This method requires the nightly build of PyTorch or the latest PyTorch built [from source](https://github.com/pytorch/pytorch?tab=readme-ov-file#from-source).

```bash
git clone https://github.com/pytorch/torchtitan
cd torchtitan
pip install -r requirements.txt
pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126 --force-reinstall
[For AMD GPU] pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3 --force-reinstall
```

### Downloading a tokenizer

`torchtitan` currently supports training Llama 3.1 (8B, 70B, 405B) out of the box. To get started training these models, we need to download a tokenizer.model. Follow the instructions on the official [meta-llama](https://huggingface.co/meta-llama/Llama-3.1-8B) repository to ensure you have access to the Llama model weights.
`torchtitan` currently supports training Llama 3.1 (8B, 70B, 405B) out of the box. To get started training these models, we need to download the tokenizer. Follow the instructions on the official [meta-llama](https://huggingface.co/meta-llama/Llama-3.1-8B) repository to ensure you have access to the Llama model weights.

Once you have confirmed access, you can run the following command to download the Llama 3.1 tokenizer to your local machine.

```bash
# Get your HF token from https://huggingface.co/settings/tokens

# Llama 3.1 tokenizer
python scripts/download_tokenizer.py --repo_id meta-llama/Llama-3.1-8B --hf_token=...
python scripts/download_hf_assets.py --repo_id meta-llama/Llama-3.1-8B --assets tokenizer --hf_token=...
```

### Start a training run
54 changes: 54 additions & 0 deletions benchmarks/llama3-8b_h200_202506_trainy-whitefiber.md
@@ -0,0 +1,54 @@
This benchmark was performed by the Trainy team on WhiteFiber in June 2025, to establish a baseline of Trainy platform performance on H200s across multiple hosts.

### Models

Llama 3.1 8B

### Hardware

Each host has

- 8 NVIDIA H200 GPUs connected via NVLink.
- Hosts are interconnected by a backend RDMA fabric providing 400 Gb/s (Mellanox CX-7) per GPU.

### Configuration

Runs were invoked with the following command, where `NUM_NODES` was `4` or `8`:
```
torchrun \
--nnodes $NUM_NODES \
--nproc_per_node 8 \
--rdzv_id 101 \
--rdzv_backend c10d \
--rdzv_endpoint "$MASTER_ADDR:29500" \
torchtitan/train.py \
--job.config-file torchtitan/models/llama3/train_configs/llama3_8b.toml \
--metrics.enable_wandb \
--training.local_batch_size=2 \
--training.compile \
--model.converters="float8" \
--float8.enable_fsdp_float8_all_gather \
--float8.precompute_float8_dynamic_scale_for_fsdp \
--float8.force_recompute_fp8_weight_in_bwd \
--profiling.profile_freq 1000000 \
--training.steps 2000
```

### Results

Detailed performance results and training configurations can be found in the table below and can be visualized in [this WandB report](https://api.wandb.ai/links/asaiacai/w4c46stp). `TPS` and `Memory(GiB)` are sampled arbitrarily at the 100th iteration:

| NUM_NODES | TPS/GPU | Memory(GiB) |
| ----- | ----: | ----: |
| 4 | 10938 | 47.96 |
| 8 | 10753 | 46.97 |
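
As a rough back-of-envelope check, assuming the standard `6 * N * TPS` training-FLOPs estimate (N ≈ 8.0e9 parameters) and an H200 dense BF16 peak of roughly 989 TFLOP/s:

```
6 × 8.0e9 × 10938 tokens/s ≈ 5.25e14 FLOP/s per GPU ≈ 53% MFU (BF16-referenced)
```

Since the runs use float8 matmuls with compile, utilization measured against the fp8 peak would be correspondingly lower.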


### Versions and Dates

| repo | commit | date |
| --- | --- | --- |
| torch | [2.8.0a0+5228986c39](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-05.html) | 2025/05/29 |
| torchao | [0afa4c1](https://github.com/pytorch/ao/commit/0afa4c1bd28c82921e360ddbd1b27c9d6da5b947) | 2025/06/13 |
| torchtitan | [e7c0cae](https://github.com/pytorch/torchtitan/commit/e7c0cae934df78d6e9c2835f42ff1f757dc3fddc) | 2025/06/13 |