diff --git a/ai-ml/hotswap-hero-train-job/deploy.sh b/ai-ml/hotswap-hero-train-job/deploy.sh
new file mode 100755
index 0000000000..95b35c6481
--- /dev/null
+++ b/ai-ml/hotswap-hero-train-job/deploy.sh
@@ -0,0 +1,55 @@
+#!/bin/sh
+#
+# Provisions the GKE platform with Terraform, installs JobSet and submits the
+# high- and low-priority demo workloads.
+#
+# Usage:   PROJECT_ID=<your-project-id> ./deploy.sh
+# Run from the directory containing gke-platform/ and workloads/, because all
+# paths below are relative.
+
+# Abort on the first failing command or unset variable so later steps never run
+# against a partially provisioned environment.
+set -eu
+
+# PROJECT_ID has no sensible default; fail fast with a clear message.
+: "${PROJECT_ID:?PROJECT_ID must be set, e.g. export PROJECT_ID=my-project}"
+
+export REGION=us-west4
+export TPU_NODE_LOCATION=us-west4-a
+
+gcloud services enable container.googleapis.com \
+    --project="$PROJECT_ID"
+
+# Create terraform.tfvars file
+cat <<EOF >gke-platform/terraform.tfvars
+project_id                  = "$PROJECT_ID"
+enable_autopilot            = false
+enable_tpu                  = true
+region                      = "$REGION"
+tpu_node_location           = ["$TPU_NODE_LOCATION"]
+tpu_machine_type            = "ct5lp-hightpu-4t"
+tpu_topology                = "2x4"
+tpu_node_pools_number       = 3
+EOF
+
+# Create clusters
+terraform -chdir=gke-platform init
+terraform -chdir=gke-platform apply
+
+# Get cluster credentials
+gcloud container clusters get-credentials llm-cluster \
+    --region="$REGION" \
+    --project="$PROJECT_ID"
+
+# Install JobSets
+kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.7.0/manifests.yaml
+
+# Block until the JobSet controller is actually available instead of sleeping a
+# fixed 60 seconds, which is both slower on fast clusters and racy on slow ones.
+kubectl rollout status deployment/jobset-controller-manager \
+    --namespace=jobset-system --timeout=300s
+
+kubectl create -f workloads/priority.yaml
+
+kubectl create -f workloads/high-priority-job.yaml
+kubectl create -f workloads/low-priority-job.yaml
diff --git a/ai-ml/hotswap-hero-train-job/gke-platform/main.tf b/ai-ml/hotswap-hero-train-job/gke-platform/main.tf
new file mode 100644
index 0000000000..d61e9101e2
--- /dev/null
+++ b/ai-ml/hotswap-hero-train-job/gke-platform/main.tf
@@ -0,0 +1,73 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+provider "google" {
+  project = var.project_id # default project for resources created through this provider
+}
+
+provider "google-beta" {
+  project = var.project_id # beta provider targets the same project
+}
+
+resource "google_service_account" "service_account" {
+  account_id   = "gke-llm-sa" # its email is passed to both cluster modules in this file
+  display_name = "LLM clusters Service Account"
+}
+
+# Allow the service account to write metrics to Cloud Monitoring
+resource "google_project_iam_member" "project" {
+  project = var.project_id
+  role    = "roles/monitoring.metricWriter"
+  member  = "serviceAccount:${google_service_account.service_account.email}"
+}
+
+resource "google_project_iam_member" "logs_writer" {
+  project = var.project_id
+  role    = "roles/logging.logWriter" # allow the same service account to write log entries
+  member  = "serviceAccount:${google_service_account.service_account.email}"
+}
+
+module "gke_autopilot" {
+  source = "./modules/gke_autopilot"
+
+  project_id       = var.project_id
+  region           = var.region
+  cluster_name     = var.cluster_name
+  cluster_labels   = var.cluster_labels
+  enable_autopilot = var.enable_autopilot # the module creates its cluster only when this is true
+  service_account  = google_service_account.service_account.email
+  enable_fleet     = var.enable_fleet
+  fleet_project_id = var.fleet_project_id
+}
+
+
+
+module "gke_standard" {
+  source = "./modules/gke_standard"
+
+  project_id            = var.project_id
+  region                = var.region
+  cluster_name          = var.cluster_name
+  cluster_labels        = var.cluster_labels
+  enable_autopilot      = var.enable_autopilot # the module creates its cluster and node pools only when this is false
+  enable_tpu            = var.enable_tpu       # TPU node pools are additionally gated on this flag
+  tpu_node_location     = var.tpu_node_location
+  service_account       = google_service_account.service_account.email
+  enable_fleet          = var.enable_fleet
+  fleet_project_id      = var.fleet_project_id
+  gateway_api_channel   = var.gateway_api_channel
+  tpu_machine_type      = var.tpu_machine_type
+  tpu_node_pools_number = var.tpu_node_pools_number
+  tpu_topology          = var.tpu_topology
+}
diff --git a/ai-ml/hotswap-hero-train-job/gke-platform/modules/gke_autopilot/main.tf b/ai-ml/hotswap-hero-train-job/gke-platform/modules/gke_autopilot/main.tf
new file mode 100644
index 0000000000..b1f1c9096e
--- /dev/null
+++ b/ai-ml/hotswap-hero-train-job/gke-platform/modules/gke_autopilot/main.tf
@@ -0,0 +1,96 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+provider "google" {
+  project = var.project_id
+  region  = var.region
+}
+
+data "google_service_account" "default" {
+  account_id = var.service_account # looked up so the cluster's node settings can reference its email
+}
+
+# Autopilot GKE cluster; instantiated only when var.enable_autopilot is true.
+resource "google_container_cluster" "ml_cluster" {
+  count = var.enable_autopilot == true ? 1 : 0
+
+  name                = var.cluster_name
+  location            = var.region
+  enable_autopilot    = true
+  min_master_version  = "1.31"
+  initial_node_count  = 1
+  deletion_protection = false
+  resource_labels     = var.cluster_labels
+
+  release_channel {
+    channel = "RAPID"
+  }
+
+  logging_config {
+    enable_components = ["SYSTEM_COMPONENTS", "WORKLOADS"]
+  }
+
+  monitoring_config {
+    enable_components = ["SYSTEM_COMPONENTS"]
+    managed_prometheus {
+      enabled = "true"
+    }
+  }
+
+  node_config {
+    # Nodes run as the custom service account (Google recommends this over the default one), limited to the scopes below.
+    service_account = data.google_service_account.default.email
+    oauth_scopes = [
+      "https://www.googleapis.com/auth/devstorage.read_only",
+      "https://www.googleapis.com/auth/logging.write",
+      "https://www.googleapis.com/auth/monitoring",
+      "https://www.googleapis.com/auth/service.management.readonly",
+      "https://www.googleapis.com/auth/servicecontrol",
+      "https://www.googleapis.com/auth/trace.append",
+    ]
+    reservation_affinity {
+      consume_reservation_type = "NO_RESERVATION"
+    }
+    gvnic {
+      enabled = true
+    }
+  }
+
+  cluster_autoscaling {
+    auto_provisioning_defaults {
+      service_account = data.google_service_account.default.email
+      oauth_scopes = [
+        "https://www.googleapis.com/auth/devstorage.read_only",
+        "https://www.googleapis.com/auth/logging.write",
+        "https://www.googleapis.com/auth/monitoring",
+        "https://www.googleapis.com/auth/service.management.readonly",
+        "https://www.googleapis.com/auth/servicecontrol",
+        "https://www.googleapis.com/auth/trace.append",
+      ]
+    }
+  }
+
+  dynamic "fleet" {
+    for_each = var.enable_fleet ? [1] : []
+    content {
+      project = var.fleet_project_id
+    }
+  }
+
+  # Empty CIDR blocks leave the pod and service ranges for GKE to choose.
+  ip_allocation_policy {
+    cluster_ipv4_cidr_block  = ""
+    services_ipv4_cidr_block = ""
+  }
+}
diff --git a/ai-ml/hotswap-hero-train-job/gke-platform/modules/gke_autopilot/output.tf b/ai-ml/hotswap-hero-train-job/gke-platform/modules/gke_autopilot/output.tf
new file mode 100644
index 0000000000..03cb760b7c
--- /dev/null
+++ b/ai-ml/hotswap-hero-train-job/gke-platform/modules/gke_autopilot/output.tf
@@ -0,0 +1,38 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+output "project_id" {
+  description = "GCP project id"
+  value       = one(google_container_cluster.ml_cluster[*].project) # null when the cluster is not created
+}
+
+output "region" {
+  description = "GCP region"
+  value       = one(google_container_cluster.ml_cluster[*].location)
+}
+
+output "cluster_name" {
+  description = "The name of the GKE cluster"
+  value       = one(google_container_cluster.ml_cluster[*].name)
+}
+
+output "kubernetes_host" {
+  description = "Kubernetes cluster host"
+  value       = one(google_container_cluster.ml_cluster[*].endpoint)
+}
+
+output "cluster_certificate" {
+  description = "Kubernetes cluster CA certificate"
+  value       = var.enable_autopilot ? base64decode(google_container_cluster.ml_cluster[0].master_auth[0].cluster_ca_certificate) : null
+}
diff --git a/ai-ml/hotswap-hero-train-job/gke-platform/modules/gke_autopilot/variables.tf b/ai-ml/hotswap-hero-train-job/gke-platform/modules/gke_autopilot/variables.tf
new file mode 100644
index 0000000000..8f4d70bbe5
--- /dev/null
+++ b/ai-ml/hotswap-hero-train-job/gke-platform/modules/gke_autopilot/variables.tf
@@ -0,0 +1,64 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+variable "project_id" {
+  type        = string
+  description = "GCP project id"
+  default     = null
+}
+
+variable "region" {
+  type        = string
+  description = "GCP project region or zone"
+  default     = "us-central1"
+}
+
+variable "cluster_name" {
+  type        = string
+  description = "GKE cluster name"
+  default     = "ml-cluster"
+}
+
+variable "cluster_labels" {
+  type        = map(any)
+  description = "GKE cluster labels"
+  default = {
+    created-by = "ai-on-gke"
+  }
+}
+
+variable "num_gpu_nodes" {
+  description = "Number of GPU nodes in the cluster"
+  default     = 1 # NOTE(review): not referenced by this module's main.tf or output.tf; confirm it is still needed
+}
+
+variable "enable_autopilot" {
+  type        = bool
+  description = "Set to true to enable GKE Autopilot clusters"
+  default     = false
+}
+
+variable "service_account" {
+  type = string # used as account_id for the google_service_account lookup in main.tf
+}
+
+variable "enable_fleet" {
+  type    = bool
+  default = false # when true, main.tf adds a fleet block to the cluster
+}
+
+variable "fleet_project_id" {
+  type    = string
+  default = "" # project set in the fleet block; only read when enable_fleet is true
+}
diff --git a/ai-ml/hotswap-hero-train-job/gke-platform/modules/gke_standard/main.tf b/ai-ml/hotswap-hero-train-job/gke-platform/modules/gke_standard/main.tf
new file mode 100644
index 0000000000..0c5116642f
--- /dev/null
+++ b/ai-ml/hotswap-hero-train-job/gke-platform/modules/gke_standard/main.tf
@@ -0,0 +1,174 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+provider "google" {
+  project = var.project_id
+  region  = var.region
+}
+
+provider "google-beta" {
+  project = var.project_id
+  region  = var.region
+}
+
+locals {
+  gateway_api_config = var.gateway_api_channel != null ? [{ channel : var.gateway_api_channel }] : [] # empty or one-element list driving the dynamic gateway_api_config block
+}
+
+data "google_service_account" "default" {
+  account_id = var.service_account # looked up so node configs can reference its email
+}
+
+# Standard-mode GKE cluster; created only when var.enable_autopilot is false.
+resource "google_container_cluster" "ml_cluster" {
+  name                     = var.cluster_name
+  location                 = var.region
+  count                    = var.enable_autopilot == false ? 1 : 0
+  remove_default_node_pool = true # node pools are declared as separate resources in this file
+  initial_node_count       = 1
+  min_master_version       = "1.31"
+
+  deletion_protection      = false
+
+  node_config {
+    service_account = data.google_service_account.default.email
+    oauth_scopes = [
+      "https://www.googleapis.com/auth/devstorage.read_only",
+      "https://www.googleapis.com/auth/logging.write",
+      "https://www.googleapis.com/auth/monitoring",
+      "https://www.googleapis.com/auth/service.management.readonly",
+      "https://www.googleapis.com/auth/servicecontrol",
+      "https://www.googleapis.com/auth/trace.append",
+    ]
+  }
+
+  logging_config {
+    enable_components = [
+      "APISERVER",
+      "CONTROLLER_MANAGER",
+      "SCHEDULER",
+      "SYSTEM_COMPONENTS",
+      "WORKLOADS"
+    ]
+  }
+
+  monitoring_config {
+    enable_components = ["SYSTEM_COMPONENTS"]
+    managed_prometheus {
+      enabled = "true"
+    }
+  }
+  dynamic "fleet" {
+    for_each = var.enable_fleet ? [1] : [] # block is emitted only when fleet registration is requested
+    content {
+      project = var.fleet_project_id
+    }
+  }
+
+  dynamic "gateway_api_config" {
+    for_each = local.gateway_api_config # empty list when var.gateway_api_channel is null
+
+    content {
+      channel = gateway_api_config.value.channel
+    }
+  }
+
+  workload_identity_config {
+    workload_pool = "${var.project_id}.svc.id.goog"
+  }
+
+  # Subscribe the cluster to the RAPID release channel.
+  release_channel {
+    channel = "RAPID"
+  }
+
+  resource_labels = var.cluster_labels
+
+  addons_config {
+    gcp_filestore_csi_driver_config {
+      enabled = true
+    }
+
+    gcs_fuse_csi_driver_config {
+      enabled = true
+    }
+
+    gce_persistent_disk_csi_driver_config {
+      enabled = true
+    }
+  }
+}
+
+
+resource "google_container_node_pool" "cpu_pool" {
+  count    = var.enable_autopilot ? 0 : 1 # CPU pool exists only alongside the Standard cluster
+  name     = "cpu-pool"
+  location = var.region
+  cluster  = var.enable_autopilot ? null : google_container_cluster.ml_cluster[0].name
+
+  node_config {
+    machine_type    = "n1-standard-4"
+    service_account = data.google_service_account.default.email
+    oauth_scopes = [
+      "https://www.googleapis.com/auth/logging.write",
+      "https://www.googleapis.com/auth/monitoring",
+      "https://www.googleapis.com/auth/devstorage.read_only",
+      "https://www.googleapis.com/auth/trace.append",
+      "https://www.googleapis.com/auth/service.management.readonly",
+      "https://www.googleapis.com/auth/servicecontrol",
+    ]
+  }
+
+  autoscaling {
+    min_node_count = 1
+    max_node_count = 3
+  }
+
+  management {
+    auto_repair  = "true"
+    auto_upgrade = "true"
+  }
+}
+
+resource "google_container_node_pool" "tpu_pool" {
+  provider           = google-beta
+  name               = "tpu-pool-${count.index}"
+  location           = var.region
+  node_locations     = var.tpu_node_location
+  cluster            = var.enable_autopilot == false && var.enable_tpu ? google_container_cluster.ml_cluster[0].name : null
+  initial_node_count = var.num_nodes
+  count              = var.enable_autopilot == false && var.enable_tpu ? var.tpu_node_pools_number : 0 # one pool per requested TPU slice; none for Autopilot or when TPU is disabled
+
+  autoscaling {
+    min_node_count = 0
+    max_node_count = 2
+  }
+  node_config {
+    machine_type    = var.tpu_machine_type
+    service_account = data.google_service_account.default.email # same custom node identity as the cluster and cpu_pool, not the default compute service account
+    oauth_scopes = [
+      "https://www.googleapis.com/auth/logging.write",
+      "https://www.googleapis.com/auth/monitoring",
+      "https://www.googleapis.com/auth/devstorage.read_only",
+      "https://www.googleapis.com/auth/trace.append",
+      "https://www.googleapis.com/auth/service.management.readonly",
+      "https://www.googleapis.com/auth/servicecontrol",
+    ]
+    spot = true # TPU nodes are provisioned as Spot VMs
+  }
+  placement_policy {
+    type         = "COMPACT"
+    tpu_topology = var.tpu_topology
+  }
+}
diff --git a/ai-ml/hotswap-hero-train-job/gke-platform/modules/gke_standard/output.tf b/ai-ml/hotswap-hero-train-job/gke-platform/modules/gke_standard/output.tf
new file mode 100644
index 0000000000..1df05e2f8b
--- /dev/null
+++ b/ai-ml/hotswap-hero-train-job/gke-platform/modules/gke_standard/output.tf
@@ -0,0 +1,38 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+output "project_id" {
+  description = "GCP project id"
+  value       = one(google_container_cluster.ml_cluster[*].project) # null when the Standard cluster is not created
+}
+
+output "region" {
+  description = "GCP region"
+  value       = one(google_container_cluster.ml_cluster[*].location)
+}
+
+output "cluster_name" {
+  description = "The name of the GKE cluster"
+  value       = one(google_container_cluster.ml_cluster[*].name)
+}
+
+output "kubernetes_host" {
+  description = "Kubernetes cluster host"
+  value       = one(google_container_cluster.ml_cluster[*].endpoint)
+}
+
+output "cluster_certificate" {
+  description = "Kubernetes cluster CA certificate"
+  value       = var.enable_autopilot ? null : base64decode(google_container_cluster.ml_cluster[0].master_auth[0].cluster_ca_certificate)
+}
diff --git a/ai-ml/hotswap-hero-train-job/gke-platform/modules/gke_standard/variables.tf b/ai-ml/hotswap-hero-train-job/gke-platform/modules/gke_standard/variables.tf
new file mode 100644
index 0000000000..0a3e095926
--- /dev/null
+++ b/ai-ml/hotswap-hero-train-job/gke-platform/modules/gke_standard/variables.tf
@@ -0,0 +1,104 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+variable "project_id" {
+  type        = string
+  description = "GCP project id"
+  default     = null
+}
+
+variable "region" {
+  type        = string
+  description = "GCP project region or zone"
+  default     = "us-central1"
+}
+
+variable "cluster_name" {
+  type        = string
+  description = "GKE cluster name"
+  default     = "ml-cluster"
+}
+
+variable "cluster_labels" {
+  type        = map(any)
+  description = "GKE cluster labels"
+  default = {
+    created-by = "ai-on-gke"
+  }
+}
+
+variable "num_nodes" {
+  description = "Initial number of nodes in each TPU node pool"
+  default     = 0 # consumed as initial_node_count by the tpu_pool resource in main.tf
+}
+
+variable "enable_autopilot" {
+  type        = bool
+  description = "Set to true to enable GKE Autopilot clusters"
+  default     = false
+}
+
+variable "enable_tpu" {
+  type        = bool
+  description = "Set to true to create TPU node pool"
+  default     = false
+}
+
+variable "service_account" {
+  type = string # used as account_id for the google_service_account lookup in main.tf
+}
+
+variable "enable_fleet" {
+  type    = bool
+  default = false # when true, main.tf adds a fleet block to the cluster
+}
+
+variable "fleet_project_id" {
+  type    = string
+  default = "" # project set in the fleet block; only read when enable_fleet is true
+}
+
+variable "gateway_api_channel" {
+  type        = string
+  description = "The gateway api channel of this cluster. Accepted values are `CHANNEL_STANDARD` and `CHANNEL_DISABLED`."
+  default     = null
+}
+
+variable "gpu_driver_version" {
+  type        = string
+  description = "the NVIDIA driver version to install"
+  default     = "DEFAULT" # NOTE(review): not referenced by this module's main.tf or output.tf; confirm it is still needed
+}
+
+variable "tpu_node_location" {
+  type        = set(string)
+  description = "Location for tpu nodes"
+  default     = []
+}
+
+variable "tpu_machine_type" {
+  type        = string
+  description = "Machine type for TPU node pool"
+  default     = "" # callers enabling TPU are expected to override this
+}
+variable "tpu_topology" {
+  type        = string
+  description = "Topology for TPU node pool"
+  default     = "1x1" # passed to the TPU pool's placement_policy
+}
+
+variable "tpu_node_pools_number" {
+  description = "Number of TPU node pools. "
+  default = 1 # used as the count of tpu_pool resources when TPU is enabled on a Standard cluster
+}
\ No newline at end of file
diff --git a/ai-ml/hotswap-hero-train-job/gke-platform/variables.tf b/ai-ml/hotswap-hero-train-job/gke-platform/variables.tf
new file mode 100644
index 0000000000..ef0ecd465e
--- /dev/null
+++ b/ai-ml/hotswap-hero-train-job/gke-platform/variables.tf
@@ -0,0 +1,92 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+variable "project_id" {
+  type        = string
+  description = "GCP project id"
+  default     = "<your project>" # placeholder; deploy.sh overrides it through the generated terraform.tfvars
+}
+
+variable "region" {
+  type        = string
+  description = "GCP project region or zone"
+  default     = "us-central1"
+}
+
+
+variable "cluster_name" {
+  type        = string
+  description = "GKE cluster name"
+  default     = "llm-cluster"
+}
+
+
+variable "cluster_labels" {
+  type        = map(any)
+  description = "GKE cluster labels"
+  default = {
+    created-by = "ai-on-gke"
+  }
+}
+
+variable "enable_autopilot" {
+  type        = bool
+  description = "Set to true to enable GKE Autopilot clusters"
+  default     = false
+}
+
+variable "enable_fleet" {
+  type    = bool
+  default = false # forwarded to both cluster modules; true makes them add a fleet block
+}
+
+variable "fleet_project_id" {
+  type    = string
+  default = "" # fleet project forwarded to both cluster modules; only read when enable_fleet is true
+}
+
+variable "gateway_api_channel" {
+  type        = string
+  description = "The gateway api channel of this cluster. Accepted values are `CHANNEL_STANDARD` and `CHANNEL_DISABLED`."
+  default     = null
+}
+
+variable "enable_tpu" {
+  type        = bool
+  description = "Set to true to create TPU node pool"
+  default     = false
+}
+
+variable "tpu_node_location" {
+  type        = set(string)
+  description = "Location for tpu nodes"
+  default     = []
+}
+
+variable "tpu_machine_type" {
+  type        = string
+  description = "Machine type for TPU node pool in standard GKE cluster."
+  default     = ""
+}
+
+variable "tpu_topology" {
+  type        = string
+  description = "Topology for standard GKE cluster TPU node pool"
+  default     = "1x1"
+}
+
+variable "tpu_node_pools_number" {
+  description = "Number of TPU node pools in standard GKE cluster."
+  default     = 1 # forwarded to the gke_standard module; deploy.sh sets it to 3
+}
\ No newline at end of file
diff --git a/ai-ml/hotswap-hero-train-job/gke-platform/versions.tf b/ai-ml/hotswap-hero-train-job/gke-platform/versions.tf
new file mode 100644
index 0000000000..43ae9616aa
--- /dev/null
+++ b/ai-ml/hotswap-hero-train-job/gke-platform/versions.tf
@@ -0,0 +1,26 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+terraform {
+  required_providers {
+    google = {
+      source  = "hashicorp/google"
+      version = "~> 6.1" # any 6.x release from 6.1 onward
+    }
+    google-beta = {
+      source  = "hashicorp/google-beta"
+      version = "~> 6.1" # kept in step with the google provider constraint
+    }
+  }
+}
diff --git a/ai-ml/hotswap-hero-train-job/remove.sh b/ai-ml/hotswap-hero-train-job/remove.sh
new file mode 100755
index 0000000000..36e6e3159c
--- /dev/null
+++ b/ai-ml/hotswap-hero-train-job/remove.sh
@@ -0,0 +1,5 @@
+#!/bin/zsh
+
+cd gke-platform
+terraform destroy --auto-approve
+cd -
diff --git a/ai-ml/hotswap-hero-train-job/workloads/high-priority-job-autopilot.yaml b/ai-ml/hotswap-hero-train-job/workloads/high-priority-job-autopilot.yaml
new file mode 100644
index 0000000000..1b2c119ff4
--- /dev/null
+++ b/ai-ml/hotswap-hero-train-job/workloads/high-priority-job-autopilot.yaml
@@ -0,0 +1,66 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# [START gke_aiml_hotswap_hero_train_job_workloads_high_priority_job_autopilot]
+apiVersion: jobset.x-k8s.io/v1alpha2
+kind: JobSet
+metadata:
+  name: high-priority
+  annotations:
+    alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool # exclusive placement of each child Job per node pool
+spec:
+  failurePolicy:
+    maxRestarts: 100
+  replicatedJobs:
+    - name: job
+      replicas: 2 # two child Jobs, i.e. two TPU slices
+      template:
+        spec:
+          parallelism: 2
+          completions: 4 # NOTE(review): each pod requests 4 chips of a 2x4 slice, so 2 pods fill it; confirm 4 is intended
+          backoffLimit: 0
+          template:
+            metadata:
+              labels:
+                priority: high-priority
+            spec:
+              nodeSelector:
+                cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice
+                cloud.google.com/gke-tpu-topology: 2x4
+                cloud.google.com/gke-spot: "true"
+              priorityClassName: high-prior-job # presumably defined in workloads/priority.yaml; verify
+              containers:
+                - name: jax-tpu
+                  image: python:3.8
+                  ports:
+                    - containerPort: 8471
+                    - containerPort: 8080
+                    - containerPort: 8431
+                  command:
+                    - bash
+                    - -c
+                    - |
+                      pip install "jax[tpu]" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
+                      python -c 'import jax; print("Global device count:", jax.device_count())'
+                      sleep 60000
+                  resources:
+                    requests:
+                      cpu: 10
+                      memory: 50Gi
+                      google.com/tpu: 4
+                    limits:
+                      cpu: 10
+                      memory: 50Gi
+                      google.com/tpu: 4
+# [END gke_aiml_hotswap_hero_train_job_workloads_high_priority_job_autopilot]
\ No newline at end of file
diff --git a/ai-ml/hotswap-hero-train-job/workloads/high-priority-job.yaml b/ai-ml/hotswap-hero-train-job/workloads/high-priority-job.yaml
new file mode 100644
index 0000000000..d8ef42ce38
--- /dev/null
+++ b/ai-ml/hotswap-hero-train-job/workloads/high-priority-job.yaml
@@ -0,0 +1,69 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# [START gke_aiml_hotswap_hero_train_job_workloads_high_priority_job]  
+apiVersion: jobset.x-k8s.io/v1alpha2
+kind: JobSet
+metadata:
+  name: high-priority
+  annotations:
+    alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool # exclusive placement of each child Job per node pool
+spec:
+  failurePolicy:
+    maxRestarts: 100
+  replicatedJobs:
+    - name: job
+      replicas: 2 # two child Jobs, i.e. two TPU slices
+      template:
+        spec:
+          parallelism: 2
+          completions: 4 # NOTE(review): each pod requests 4 chips of a 2x4 slice, so 2 pods fill it; confirm 4 is intended
+          backoffLimit: 0
+          template:
+            metadata:
+              labels:
+                priority: high-priority
+            spec:
+              hostNetwork: true # pods share the node's network namespace
+              dnsPolicy: ClusterFirstWithHostNet
+              nodeSelector:
+                cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice
+                cloud.google.com/gke-tpu-topology: 2x4
+              priorityClassName: high-prior-job # presumably defined in workloads/priority.yaml; verify
+              containers:
+                - name: jax-tpu
+                  image: python:3.8
+                  ports:
+                    - containerPort: 8471
+                    - containerPort: 8080
+                    - containerPort: 8431
+                  securityContext:
+                    privileged: true
+                  command:
+                    - bash
+                    - -c
+                    - |
+                      pip install "jax[tpu]" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
+                      python -c 'import jax; print("Global device count:", jax.device_count())'
+                      sleep 60000
+                  resources:
+                    requests:
+                      cpu: 10
+                      memory: 50Gi
+                      google.com/tpu: 4
+                    limits:
+                      cpu: 10
+                      memory: 50Gi
+                      google.com/tpu: 4
+# [END gke_aiml_hotswap_hero_train_job_workloads_high_priority_job]
diff --git a/ai-ml/hotswap-hero-train-job/workloads/low-priority-job-autopilot.yaml b/ai-ml/hotswap-hero-train-job/workloads/low-priority-job-autopilot.yaml
new file mode 100644
index 0000000000..c3e7bf1c6a
--- /dev/null
+++ b/ai-ml/hotswap-hero-train-job/workloads/low-priority-job-autopilot.yaml
@@ -0,0 +1,67 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# [START gke_aiml_hotswap_hero_train_job_workloads_low_priority_job_autopilot]
+apiVersion: jobset.x-k8s.io/v1alpha2
+kind: JobSet
+metadata:
+  name: low-priority
+  annotations:
+    alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool # exclusive placement of each child Job per node pool
+spec:
+  failurePolicy:
+    maxRestarts: 100
+  replicatedJobs:
+    - name: job
+      replicas: 1 # a single child Job, i.e. one TPU slice
+      template:
+        spec:
+          parallelism: 2
+          completions: 4 # NOTE(review): each pod requests 4 chips of a 2x4 slice, so 2 pods fill it; confirm 4 is intended
+          backoffLimit: 0
+          template:
+            metadata:
+              labels:
+                priority: low-priority
+            spec:
+              nodeSelector:
+                cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice
+                cloud.google.com/gke-tpu-topology: 2x4
+                cloud.google.com/gke-spot: "true"
+              priorityClassName: low-prior-job # presumably defined in workloads/priority.yaml; verify
+              terminationGracePeriodSeconds: 0 # no grace period when the pod is terminated
+              containers:
+                - name: jax-tpu
+                  image: python:3.8
+                  ports:
+                    - containerPort: 8471
+                    - containerPort: 8080
+                    - containerPort: 8431
+                  command:
+                    - bash
+                    - -c
+                    - |
+                      pip install "jax[tpu]" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
+                      python -c 'import jax; print("Global device count:", jax.device_count())'
+                      sleep 60000
+                  resources:
+                    requests:
+                      cpu: 10
+                      memory: 50Gi
+                      google.com/tpu: 4
+                    limits:
+                      cpu: 10
+                      memory: 50Gi
+                      google.com/tpu: 4
+# [END gke_aiml_hotswap_hero_train_job_workloads_low_priority_job_autopilot]
diff --git a/ai-ml/hotswap-hero-train-job/workloads/low-priority-job.yaml b/ai-ml/hotswap-hero-train-job/workloads/low-priority-job.yaml
new file mode 100644
index 0000000000..98989f823b
--- /dev/null
+++ b/ai-ml/hotswap-hero-train-job/workloads/low-priority-job.yaml
@@ -0,0 +1,69 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# [START gke_aiml_hotswap_hero_train_job_workloads_low_priority_job]
+apiVersion: jobset.x-k8s.io/v1alpha2
+kind: JobSet  # Low-priority TPU JobSet; deploy.sh creates it after the high-priority job
+metadata:
+ name: low-priority
+ annotations:
+   alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool  # JobSet exclusive placement, keyed on the GKE node pool label
+spec:
+ failurePolicy:
+   maxRestarts: 100  # Upper bound on JobSet restarts
+ replicatedJobs:
+ - name: job
+   replicas: 1  # A single child Job is created from the template below
+   template:
+     spec:
+       parallelism: 2  # At most two pods of this Job run at the same time
+       completions: 4  # NOTE(review): completions (4) is twice parallelism (2); confirm this is intended
+       backoffLimit: 0  # The Job does not retry failed pods itself; restarts are governed by failurePolicy above
+       template:
+         metadata:
+           labels:
+             priority: low-priority
+         spec:
+           hostNetwork: true  # Pod shares the node's network namespace
+           dnsPolicy: ClusterFirstWithHostNet  # Keeps cluster DNS resolution while hostNetwork is enabled
+           nodeSelector:
+             cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice  # Only schedule onto nodes carrying this TPU accelerator label
+             cloud.google.com/gke-tpu-topology: 2x4  # Same topology value that deploy.sh writes to terraform.tfvars
+           priorityClassName: low-prior-job  # PriorityClass declared in workloads/priority.yaml (value 1000000)
+           terminationGracePeriodSeconds: 0  # No graceful-shutdown window when the pod is deleted
+           containers:
+           - name: jax-tpu
+             image: python:3.8  # NOTE(review): Python 3.8 is end-of-life; confirm current jax[tpu] releases still install on it
+             ports:
+             - containerPort: 8471
+             - containerPort: 8080
+             - containerPort: 8431
+             securityContext:
+               privileged: true  # NOTE(review): privileged container on the host network; confirm both are still required
+             command:  # Installs jax[tpu], prints the global device count, then sleeps for 60000 s
+             - bash
+             - -c
+             - |
+               pip install "jax[tpu]" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
+               python -c 'import jax; print("Global device count:", jax.device_count())'
+               sleep 60000
+             resources:  # Requests and limits are identical for every resource
+               requests:
+                 cpu: 10
+                 memory: 50Gi
+                 google.com/tpu: 4  # Four TPU resources per pod
+               limits:
+                 cpu: 10
+                 memory: 50Gi
+                 google.com/tpu: 4
+# [END gke_aiml_hotswap_hero_train_job_workloads_low_priority_job]
diff --git a/ai-ml/hotswap-hero-train-job/workloads/priority.yaml b/ai-ml/hotswap-hero-train-job/workloads/priority.yaml
new file mode 100644
index 0000000000..19cf2475a6
--- /dev/null
+++ b/ai-ml/hotswap-hero-train-job/workloads/priority.yaml
@@ -0,0 +1,31 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# [START gke_aiml_hotswap_hero_train_job_workloads_priority]
+apiVersion: scheduling.k8s.io/v1
+kind: PriorityClass  # Priority for the "hero" pods named in the description below
+metadata:
+ name: high-prior-job
+value: 2000000  # Larger than low-prior-job (1000000), so these pods rank above low-priority pods
+globalDefault: false  # Not applied automatically; pods must reference it via priorityClassName
+description: "This priority class should be used for hero pods only."
+---
+apiVersion: scheduling.k8s.io/v1
+kind: PriorityClass  # Priority referenced by the low-priority JobSets via priorityClassName
+metadata:
+ name: low-prior-job
+value: 1000000  # Smaller than high-prior-job (2000000), so these pods rank below hero pods
+globalDefault: false  # Not applied automatically; pods must reference it via priorityClassName
+description: "This priority class should be used for low priority pods only."
+# [END gke_aiml_hotswap_hero_train_job_workloads_priority]