diff --git a/ai-ml/hotswap-hero-train-job/deploy.sh b/ai-ml/hotswap-hero-train-job/deploy.sh
new file mode 100755
index 0000000000..95b35c6481
--- /dev/null
+++ b/ai-ml/hotswap-hero-train-job/deploy.sh
@@ -0,0 +1,60 @@
+#!/bin/sh
+#
+# Provisions the GKE cluster and TPU node pools with Terraform, installs
+# JobSet, and submits the priority classes and the two demo JobSets.
+#
+# Required environment:
+#   PROJECT_ID  Google Cloud project to deploy into.
+#
+# Run from the directory containing this script: the gke-platform/ and
+# workloads/ paths below are relative.
+
+# Stop at the first failing step so that later steps never act on a cluster
+# that was not created or configured correctly.
+set -eu
+
+# export PROJECT_ID=
+: "${PROJECT_ID:?PROJECT_ID must be set, e.g. export PROJECT_ID=my-project}"
+
+export REGION=us-west4
+export TPU_NODE_LOCATION=us-west4-a
+
+gcloud services enable container.googleapis.com \
+  --project="$PROJECT_ID"
+
+# Create terraform.tfvars file. The heredoc delimiter is left unquoted so the
+# shell variables are expanded into the generated file.
+cat >gke-platform/terraform.tfvars <<EOF
+project_id = "$PROJECT_ID"
+enable_autopilot = false
+enable_tpu = true
+region = "$REGION"
+tpu_node_location = ["$TPU_NODE_LOCATION"]
+tpu_machine_type = "ct5lp-hightpu-4t"
+tpu_topology = "2x4"
+tpu_node_pools_number = 3
+EOF
+
+# Create clusters
+terraform -chdir=gke-platform init
+terraform -chdir=gke-platform apply
+
+# Get cluster credentials
+gcloud container clusters get-credentials llm-cluster \
+  --region="$REGION" \
+  --project="$PROJECT_ID"
+
+# Install JobSet
+kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.7.0/manifests.yaml
+
+# Wait for the JobSet controller to report Available instead of sleeping for a
+# fixed time, which can be too short on a slow cluster or needlessly long.
+kubectl wait deployment/jobset-controller-manager \
+  --namespace=jobset-system \
+  --for=condition=Available \
+  --timeout=300s
+
+# `apply` (rather than `create`) keeps the script re-runnable after a partial run.
+kubectl apply -f workloads/priority.yaml
+kubectl apply -f workloads/high-priority-job.yaml
+kubectl apply -f workloads/low-priority-job.yaml
diff --git a/ai-ml/hotswap-hero-train-job/gke-platform/main.tf b/ai-ml/hotswap-hero-train-job/gke-platform/main.tf
new file mode 100644
index 0000000000..d61e9101e2
--- /dev/null
+++ b/ai-ml/hotswap-hero-train-job/gke-platform/main.tf
@@ -0,0 +1,73 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+provider "google" {
+  project = var.project_id
+}
+
+provider "google-beta" {
+  project = var.project_id
+}
+
+resource "google_service_account" "service_account" { # its email is passed to both cluster modules below
+  account_id   = "gke-llm-sa"
+  display_name = "LLM clusters Service Account"
+}
+
+# Grant permissions to write metrics for monitoring purposes
+resource "google_project_iam_member" "project" {
+  project = var.project_id
+  role    = "roles/monitoring.metricWriter"
+  member  = "serviceAccount:${google_service_account.service_account.email}"
+}
+
+resource "google_project_iam_member" "logs_writer" { # same service account, log-writing role
+  project = var.project_id
+  role    = "roles/logging.logWriter"
+  member  = "serviceAccount:${google_service_account.service_account.email}"
+}
+
+module "gke_autopilot" { # the module creates its cluster only when enable_autopilot is true
+  source = "./modules/gke_autopilot"
+
+  project_id       = var.project_id
+  region           = var.region
+  cluster_name     = var.cluster_name
+  cluster_labels   = var.cluster_labels
+  enable_autopilot = var.enable_autopilot
+  service_account  = google_service_account.service_account.email
+  enable_fleet     = var.enable_fleet
+  fleet_project_id = var.fleet_project_id
+}
+
+# Standard-mode counterpart of the module above: its cluster and CPU pool are
+# only created when enable_autopilot is false; TPU pools also need enable_tpu.
+module "gke_standard" {
+  source = "./modules/gke_standard"
+
+  project_id            = var.project_id
+  region                = var.region
+  cluster_name          = var.cluster_name
+  cluster_labels        = var.cluster_labels
+  enable_autopilot      = var.enable_autopilot
+  enable_tpu            = var.enable_tpu
+  tpu_node_location     = var.tpu_node_location
+  service_account       = google_service_account.service_account.email
+  enable_fleet          = var.enable_fleet
+  fleet_project_id      = var.fleet_project_id
+  gateway_api_channel   = var.gateway_api_channel
+  tpu_machine_type      = var.tpu_machine_type
+  tpu_node_pools_number = var.tpu_node_pools_number
+  tpu_topology          = var.tpu_topology
+}
diff --git a/ai-ml/hotswap-hero-train-job/gke-platform/modules/gke_autopilot/main.tf b/ai-ml/hotswap-hero-train-job/gke-platform/modules/gke_autopilot/main.tf
new file mode 100644
index 0000000000..b1f1c9096e
--- /dev/null
+++ b/ai-ml/hotswap-hero-train-job/gke-platform/modules/gke_autopilot/main.tf
@@ -0,0 +1,96 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+provider "google" {
+  project = var.project_id
+  region  = var.region
+}
+
+data "google_service_account" "default" { # looks up the account named by var.service_account
+  account_id = var.service_account
+}
+
+# GKE Autopilot cluster; zero instances are created unless var.enable_autopilot is true.
+resource "google_container_cluster" "ml_cluster" {
+  name     = var.cluster_name
+  location = var.region
+  count    = var.enable_autopilot == true ? 1 : 0 # on/off switch for the whole cluster
+
+  deletion_protection = false # allows `terraform destroy` to delete the cluster
+
+  initial_node_count = 1
+
+  logging_config {
+    enable_components = ["SYSTEM_COMPONENTS", "WORKLOADS"]
+  }
+  node_config {
+    # Google recommends custom service accounts that have cloud-platform scope and permissions granted via IAM Roles.
+    service_account = data.google_service_account.default.email
+    oauth_scopes = [
+      "https://www.googleapis.com/auth/devstorage.read_only",
+      "https://www.googleapis.com/auth/logging.write",
+      "https://www.googleapis.com/auth/monitoring",
+      "https://www.googleapis.com/auth/service.management.readonly",
+      "https://www.googleapis.com/auth/servicecontrol",
+      "https://www.googleapis.com/auth/trace.append",
+    ]
+    reservation_affinity {
+      consume_reservation_type = "NO_RESERVATION"
+    }
+    gvnic {
+      enabled = true
+    }
+  }
+  cluster_autoscaling {
+    auto_provisioning_defaults { # same service account and scopes as node_config above
+      service_account = data.google_service_account.default.email
+      oauth_scopes = [
+        "https://www.googleapis.com/auth/devstorage.read_only",
+        "https://www.googleapis.com/auth/logging.write",
+        "https://www.googleapis.com/auth/monitoring",
+        "https://www.googleapis.com/auth/service.management.readonly",
+        "https://www.googleapis.com/auth/servicecontrol",
+        "https://www.googleapis.com/auth/trace.append",
+      ]
+    }
+  }
+  monitoring_config {
+    enable_components = ["SYSTEM_COMPONENTS"]
+    managed_prometheus {
+      enabled = "true"
+    }
+  }
+
+  dynamic "fleet" { # block is emitted only when var.enable_fleet is true
+    for_each = var.enable_fleet ? [1] : []
+    content {
+      project = var.fleet_project_id
+    }
+  }
+
+  ip_allocation_policy { # NOTE(review): empty strings presumably let GKE choose the ranges - confirm
+    cluster_ipv4_cidr_block  = ""
+    services_ipv4_cidr_block = ""
+  }
+
+  enable_autopilot = true
+
+  release_channel {
+    channel = "RAPID"
+  }
+
+  min_master_version = "1.31"
+
+  resource_labels = var.cluster_labels
+}
diff --git a/ai-ml/hotswap-hero-train-job/gke-platform/modules/gke_autopilot/output.tf b/ai-ml/hotswap-hero-train-job/gke-platform/modules/gke_autopilot/output.tf
new file mode 100644
index 0000000000..03cb760b7c
--- /dev/null
+++ b/ai-ml/hotswap-hero-train-job/gke-platform/modules/gke_autopilot/output.tf
@@ -0,0 +1,38 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+output "project_id" { # every output in this file is null when enable_autopilot is false
+  description = "GCP project id"
+  value       = var.enable_autopilot ? resource.google_container_cluster.ml_cluster[0].project : null
+}
+
+output "region" { # taken from the cluster's location attribute
+  description = "GCP region"
+  value       = var.enable_autopilot ? resource.google_container_cluster.ml_cluster[0].location : null
+}
+
+output "cluster_name" {
+  description = "The name of the GKE cluster"
+  value       = var.enable_autopilot ? resource.google_container_cluster.ml_cluster[0].name : null
+}
+
+output "kubernetes_host" { # taken from the cluster's endpoint attribute
+  description = "Kubernetes cluster host"
+  value       = var.enable_autopilot ? resource.google_container_cluster.ml_cluster[0].endpoint : null
+}
+
+output "cluster_certificate" { # the provider value is base64-decoded before being exposed
+  description = "Kubernetes cluster CA certificate"
+  value       = var.enable_autopilot ? base64decode(resource.google_container_cluster.ml_cluster[0].master_auth[0].cluster_ca_certificate) : null
+}
diff --git a/ai-ml/hotswap-hero-train-job/gke-platform/modules/gke_autopilot/variables.tf b/ai-ml/hotswap-hero-train-job/gke-platform/modules/gke_autopilot/variables.tf
new file mode 100644
index 0000000000..8f4d70bbe5
--- /dev/null
+++ b/ai-ml/hotswap-hero-train-job/gke-platform/modules/gke_autopilot/variables.tf
@@ -0,0 +1,64 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+variable "project_id" { # used by this module's google provider block
+  type        = string
+  description = "GCP project id"
+  default     = null
+}
+
+variable "region" { # used as the provider region and as the cluster location
+  type        = string
+  description = "GCP project region or zone"
+  default     = "us-central1"
+}
+
+variable "cluster_name" {
+  type        = string
+  description = "GKE cluster name"
+  default     = "ml-cluster"
+}
+
+variable "cluster_labels" { # applied to the cluster as resource_labels
+  type        = map(any)
+  description = "GKE cluster labels"
+  default = {
+    created-by = "ai-on-gke"
+  }
+}
+
+variable "num_gpu_nodes" { # NOTE(review): not referenced by this module's main.tf or output.tf - confirm before relying on it
+  description = "Number of GPU nodes in the cluster"
+  default     = 1
+}
+
+variable "enable_autopilot" { # gates the cluster resource count and every output of this module
+  type        = bool
+  description = "Set to true to enable GKE Autopilot clusters"
+  default     = false
+}
+
+variable "service_account" { # required (no default); the root module passes the service account email
+  type = string
+}
+
+variable "enable_fleet" { # when true, a fleet block is added to the cluster
+  type    = bool
+  default = false
+}
+
+variable "fleet_project_id" { # project used in the fleet block; only read when enable_fleet is true
+  type    = string
+  default = ""
+}
diff --git a/ai-ml/hotswap-hero-train-job/gke-platform/modules/gke_standard/main.tf b/ai-ml/hotswap-hero-train-job/gke-platform/modules/gke_standard/main.tf
new file mode 100644
index 0000000000..0c5116642f
--- /dev/null
+++ b/ai-ml/hotswap-hero-train-job/gke-platform/modules/gke_standard/main.tf
@@ -0,0 +1,174 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+provider "google" {
+  project = var.project_id
+  region  = var.region
+}
+
+provider "google-beta" {
+  project = var.project_id
+  region  = var.region
+}
+
+locals {
+  gateway_api_config = var.gateway_api_channel != null ? [{ channel : var.gateway_api_channel }] : [] # empty list disables the dynamic block below
+}
+
+data "google_service_account" "default" { # looks up the account named by var.service_account
+  account_id = var.service_account
+}
+
+# GKE Standard cluster; zero instances are created when var.enable_autopilot is true.
+resource "google_container_cluster" "ml_cluster" {
+  name                     = var.cluster_name
+  location                 = var.region
+  count                    = var.enable_autopilot == false ? 1 : 0
+  remove_default_node_pool = true # workloads run on the explicit cpu_pool / tpu_pool below
+  initial_node_count       = 1
+  min_master_version       = "1.31"
+
+  deletion_protection = false # allows `terraform destroy` to delete the cluster
+
+  node_config {
+    service_account = data.google_service_account.default.email
+    oauth_scopes = [
+      "https://www.googleapis.com/auth/devstorage.read_only",
+      "https://www.googleapis.com/auth/logging.write",
+      "https://www.googleapis.com/auth/monitoring",
+      "https://www.googleapis.com/auth/service.management.readonly",
+      "https://www.googleapis.com/auth/servicecontrol",
+      "https://www.googleapis.com/auth/trace.append",
+    ]
+  }
+
+  logging_config {
+    enable_components = [
+      "APISERVER",
+      "CONTROLLER_MANAGER",
+      "SCHEDULER",
+      "SYSTEM_COMPONENTS",
+      "WORKLOADS"
+    ]
+  }
+
+  monitoring_config {
+    enable_components = ["SYSTEM_COMPONENTS"]
+    managed_prometheus {
+      enabled = true
+    }
+  }
+  dynamic "fleet" { # block is emitted only when var.enable_fleet is true
+    for_each = var.enable_fleet ? [1] : []
+    content {
+      project = var.fleet_project_id
+    }
+  }
+
+  dynamic "gateway_api_config" { # block is emitted only when var.gateway_api_channel is set
+    for_each = local.gateway_api_config
+
+    content {
+      channel = gateway_api_config.value.channel
+    }
+  }
+
+  workload_identity_config {
+    workload_pool = "${var.project_id}.svc.id.goog"
+  }
+
+  release_channel {
+    channel = "RAPID"
+  }
+
+  resource_labels = var.cluster_labels
+
+  addons_config {
+    gcp_filestore_csi_driver_config {
+      enabled = true
+    }
+
+    gcs_fuse_csi_driver_config {
+      enabled = true
+    }
+
+    gce_persistent_disk_csi_driver_config {
+      enabled = true
+    }
+  }
+}
+
+resource "google_container_node_pool" "cpu_pool" { # general-purpose pool, created with the Standard cluster
+  name     = "cpu-pool"
+  location = var.region
+  count    = var.enable_autopilot ? 0 : 1
+  cluster  = var.enable_autopilot ? null : google_container_cluster.ml_cluster[0].name
+
+  autoscaling {
+    min_node_count = 1
+    max_node_count = 3
+  }
+
+  management {
+    auto_repair  = true
+    auto_upgrade = true
+  }
+
+  node_config {
+    machine_type    = "n1-standard-4"
+    service_account = data.google_service_account.default.email
+    oauth_scopes = [
+      "https://www.googleapis.com/auth/logging.write",
+      "https://www.googleapis.com/auth/monitoring",
+      "https://www.googleapis.com/auth/devstorage.read_only",
+      "https://www.googleapis.com/auth/trace.append",
+      "https://www.googleapis.com/auth/service.management.readonly",
+      "https://www.googleapis.com/auth/servicecontrol",
+    ]
+  }
+}
+
+resource "google_container_node_pool" "tpu_pool" { # var.tpu_node_pools_number pools, named tpu-pool-0..N-1
+  provider           = google-beta
+  name               = "tpu-pool-${count.index}"
+  location           = var.region
+  node_locations     = var.tpu_node_location
+  cluster            = var.enable_autopilot == false && var.enable_tpu ? google_container_cluster.ml_cluster[0].name : null
+  initial_node_count = var.num_nodes
+  count              = var.enable_autopilot == false && var.enable_tpu ? var.tpu_node_pools_number : 0
+
+  autoscaling {
+    min_node_count = 0
+    max_node_count = 2
+  }
+
+  node_config {
+    machine_type = var.tpu_machine_type
+    # Same dedicated service account as the cluster and cpu_pool node configs.
+    service_account = data.google_service_account.default.email
+    oauth_scopes = [
+      "https://www.googleapis.com/auth/logging.write",
+      "https://www.googleapis.com/auth/monitoring",
+      "https://www.googleapis.com/auth/devstorage.read_only",
+      "https://www.googleapis.com/auth/trace.append",
+      "https://www.googleapis.com/auth/service.management.readonly",
+      "https://www.googleapis.com/auth/servicecontrol",
+    ]
+    spot = true # TPU nodes are provisioned as Spot VMs
+  }
+  placement_policy {
+    type         = "COMPACT"
+    tpu_topology = var.tpu_topology
+  }
+}
diff --git a/ai-ml/hotswap-hero-train-job/gke-platform/modules/gke_standard/output.tf b/ai-ml/hotswap-hero-train-job/gke-platform/modules/gke_standard/output.tf
new file mode 100644
index 0000000000..1df05e2f8b
--- /dev/null
+++ b/ai-ml/hotswap-hero-train-job/gke-platform/modules/gke_standard/output.tf
@@ -0,0 +1,38 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+output "project_id" { # every output in this file is null when enable_autopilot is true
+  description = "GCP project id"
+  value       = var.enable_autopilot ? null : resource.google_container_cluster.ml_cluster[0].project
+}
+
+output "region" {
+  description = "GCP region"
+  value       = var.enable_autopilot ? null : resource.google_container_cluster.ml_cluster[0].location
+}
+
+output "cluster_name" {
+  description = "The name of the GKE cluster"
+  value       = var.enable_autopilot ? null : resource.google_container_cluster.ml_cluster[0].name
+}
+
+output "kubernetes_host" {
+  description = "Kubernetes cluster host"
+  value       = var.enable_autopilot ? null : resource.google_container_cluster.ml_cluster[0].endpoint
+}
+
+output "cluster_certificate" { # the provider value is base64-decoded before being exposed
+  description = "Kubernetes cluster CA certificate"
+  value       = var.enable_autopilot ? null : base64decode(resource.google_container_cluster.ml_cluster[0].master_auth[0].cluster_ca_certificate)
+}
diff --git a/ai-ml/hotswap-hero-train-job/gke-platform/modules/gke_standard/variables.tf b/ai-ml/hotswap-hero-train-job/gke-platform/modules/gke_standard/variables.tf
new file mode 100644
index 0000000000..0a3e095926
--- /dev/null
+++ b/ai-ml/hotswap-hero-train-job/gke-platform/modules/gke_standard/variables.tf
@@ -0,0 +1,104 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+variable "project_id" {
+  type        = string
+  description = "GCP project id"
+  default     = null
+}
+
+variable "region" {
+  type        = string
+  description = "GCP project region or zone"
+  default     = "us-central1"
+}
+
+variable "cluster_name" {
+  type        = string
+  description = "GKE cluster name"
+  default     = "ml-cluster"
+}
+
+variable "cluster_labels" {
+  type        = map(any)
+  description = "GKE cluster labels"
+  default = {
+    created-by = "ai-on-gke"
+  }
+}
+
+variable "num_nodes" { # consumed as initial_node_count of the TPU node pools
+  description = "Initial number of nodes in each TPU node pool"
+  default     = 0
+}
+
+variable "enable_autopilot" {
+  type        = bool
+  description = "Set to true to enable GKE Autopilot clusters"
+  default     = false
+}
+
+variable "enable_tpu" {
+  type        = bool
+  description = "Set to true to create TPU node pool"
+  default     = false
+}
+
+variable "service_account" { # required (no default); the root module passes the service account email
+  type = string
+}
+
+variable "enable_fleet" {
+  type    = bool
+  default = false
+}
+
+variable "fleet_project_id" {
+  type    = string
+  default = ""
+}
+
+variable "gateway_api_channel" {
+  type        = string
+  description = "The gateway api channel of this cluster. Accepted values are `CHANNEL_STANDARD` and `CHANNEL_DISABLED`."
+  default     = null
+}
+
+variable "gpu_driver_version" { # NOTE(review): not referenced by this module's main.tf - confirm before relying on it
+  type        = string
+  description = "the NVIDIA driver version to install"
+  default     = "DEFAULT"
+}
+
+variable "tpu_node_location" {
+  type        = set(string)
+  description = "Location for tpu nodes"
+  default     = []
+}
+
+variable "tpu_machine_type" {
+  type        = string
+  description = "Machine type for TPU node pool"
+  default     = ""
+}
+variable "tpu_topology" {
+  type        = string
+  description = "Topology for TPU node pool"
+  default     = "1x1"
+}
+
+variable "tpu_node_pools_number" { # used as the count of the tpu_pool resource
+  description = "Number of TPU node pools."
+  default     = 1
+}
diff --git a/ai-ml/hotswap-hero-train-job/gke-platform/variables.tf b/ai-ml/hotswap-hero-train-job/gke-platform/variables.tf
new file mode 100644
index 0000000000..ef0ecd465e
--- /dev/null
+++ b/ai-ml/hotswap-hero-train-job/gke-platform/variables.tf
@@ -0,0 +1,92 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+variable "project_id" {
+  type        = string
+  description = "GCP project id"
+  default     = ""
+}
+
+variable "region" {
+  type        = string
+  description = "GCP project region or zone"
+  default     = "us-central1"
+}
+
+variable "cluster_name" {
+  type        = string
+  description = "GKE cluster name"
+  default     = "llm-cluster"
+}
+
+
+variable "cluster_labels" {
+  type        = map(any)
+  description = "GKE cluster labels"
+  default = {
+    created-by = "ai-on-gke"
+  }
+}
+
+variable "enable_autopilot" { # selects which of the two cluster modules actually creates a cluster
+  type        = bool
+  description = "Set to true to enable GKE Autopilot clusters"
+  default     = false
+}
+
+variable "enable_fleet" {
+  type    = bool
+  default = false
+}
+
+variable "fleet_project_id" {
+  type    = string
+  default = ""
+}
+
+variable "gateway_api_channel" {
+  type        = string
+  description = "The gateway api channel of this cluster. Accepted values are `CHANNEL_STANDARD` and `CHANNEL_DISABLED`."
+  default     = null
+}
+
+variable "enable_tpu" {
+  type        = bool
+  description = "Set to true to create TPU node pool"
+  default     = false
+}
+
+variable "tpu_node_location" {
+  type        = set(string)
+  description = "Location for tpu nodes"
+  default     = []
+}
+
+variable "tpu_machine_type" {
+  type        = string
+  description = "Machine type for TPU node pool in standard GKE cluster."
+  default     = ""
+}
+
+variable "tpu_topology" {
+  type        = string
+  description = "Topology for standard GKE cluster TPU node pool"
+  default     = "1x1"
+}
+
+variable "tpu_node_pools_number" {
+  type        = number # feeds a resource count, so non-numeric input is rejected here
+  description = "Number of TPU node pools in standard GKE cluster."
+  default     = 1
+}
diff --git a/ai-ml/hotswap-hero-train-job/gke-platform/versions.tf b/ai-ml/hotswap-hero-train-job/gke-platform/versions.tf
new file mode 100644
index 0000000000..43ae9616aa
--- /dev/null
+++ b/ai-ml/hotswap-hero-train-job/gke-platform/versions.tf
@@ -0,0 +1,26 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+terraform {
+  required_providers {
+    google = {
+      source  = "hashicorp/google"
+      version = "~> 6.1"
+    }
+    google-beta = {
+      source  = "hashicorp/google-beta"
+      version = "~> 6.1"
+    }
+  }
+}
diff --git a/ai-ml/hotswap-hero-train-job/remove.sh b/ai-ml/hotswap-hero-train-job/remove.sh
new file mode 100755
index 0000000000..36e6e3159c
--- /dev/null
+++ b/ai-ml/hotswap-hero-train-job/remove.sh
@@ -0,0 +1,10 @@
+#!/bin/sh
+#
+# Destroys the Terraform-managed infrastructure defined in gke-platform/.
+# Run from the directory containing this script.
+
+set -eu
+
+# -chdir targets gke-platform/ without changing the working directory, so a
+# missing directory fails the command instead of destroying from elsewhere.
+terraform -chdir=gke-platform destroy -auto-approve
diff --git a/ai-ml/hotswap-hero-train-job/workloads/high-priority-job-autopilot.yaml b/ai-ml/hotswap-hero-train-job/workloads/high-priority-job-autopilot.yaml
new file mode 100644
index 0000000000..1b2c119ff4
--- /dev/null
+++ b/ai-ml/hotswap-hero-train-job/workloads/high-priority-job-autopilot.yaml
@@ -0,0 +1,66 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# [START gke_aiml_hotswap_hero_train_job_workloads_high_priority_job_autopilot]
+apiVersion: jobset.x-k8s.io/v1alpha2
+kind: JobSet
+metadata:
+  name: high-priority
+  annotations:
+    alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool  # NOTE(review): presumably one child Job per node pool - confirm against JobSet docs
+spec:
+  failurePolicy:
+    maxRestarts: 100
+  replicatedJobs:
+  - name: job
+    replicas: 2  # the low-priority JobSet uses 1 replica
+    template:
+      spec:
+        parallelism: 2
+        completions: 4
+        backoffLimit: 0
+        template:
+          metadata:
+            labels:
+              priority: high-priority
+          spec:
+            nodeSelector:
+              cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice
+              cloud.google.com/gke-tpu-topology: 2x4
+              cloud.google.com/gke-spot: "true"  # only present in the *-autopilot variants of these manifests
+            priorityClassName: high-prior-job  # PriorityClass defined in workloads/priority.yaml (value 2000000)
+            containers:
+            - name: jax-tpu
+              image: python:3.8
+              ports:
+              - containerPort: 8471
+              - containerPort: 8080
+              - containerPort: 8431
+              command:
+              - bash
+              - -c
+              - |
+                pip install "jax[tpu]" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
+                python -c 'import jax; print("Global device count:", jax.device_count())'
+                sleep 60000
+              resources:  # requests equal limits for cpu, memory and TPU
+                requests:
+                  cpu: 10
+                  memory: 50Gi
+                  google.com/tpu: 4
+                limits:
+                  cpu: 10
+                  memory: 50Gi
+                  google.com/tpu: 4
+# [END gke_aiml_hotswap_hero_train_job_workloads_high_priority_job_autopilot]
\ No newline at end of file
diff --git a/ai-ml/hotswap-hero-train-job/workloads/high-priority-job.yaml b/ai-ml/hotswap-hero-train-job/workloads/high-priority-job.yaml
new file mode 100644
index 0000000000..d8ef42ce38
--- /dev/null
+++ b/ai-ml/hotswap-hero-train-job/workloads/high-priority-job.yaml
@@ -0,0 +1,69 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# [START gke_aiml_hotswap_hero_train_job_workloads_high_priority_job]
+apiVersion: jobset.x-k8s.io/v1alpha2
+kind: JobSet
+metadata:
+  name: high-priority
+  annotations:
+    alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool  # NOTE(review): presumably one child Job per node pool - confirm against JobSet docs
+spec:
+  failurePolicy:
+    maxRestarts: 100
+  replicatedJobs:
+  - name: job
+    replicas: 2  # the low-priority JobSet uses 1 replica
+    template:
+      spec:
+        parallelism: 2
+        completions: 4
+        backoffLimit: 0
+        template:
+          metadata:
+            labels:
+              priority: high-priority
+          spec:
+            hostNetwork: true
+            dnsPolicy: ClusterFirstWithHostNet  # paired with hostNetwork so cluster DNS is still used
+            nodeSelector:
+              cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice
+              cloud.google.com/gke-tpu-topology: 2x4  # same topology deploy.sh writes to terraform.tfvars
+            priorityClassName: high-prior-job  # PriorityClass defined in workloads/priority.yaml (value 2000000)
+            containers:
+            - name: jax-tpu
+              image: python:3.8
+              ports:
+              - containerPort: 8471
+              - containerPort: 8080
+              - containerPort: 8431
+              securityContext:
+                privileged: true
+              command:
+              - bash
+              - -c
+              - |
+                pip install "jax[tpu]" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
+                python -c 'import jax; print("Global device count:", jax.device_count())'
+                sleep 60000
+              resources:  # requests equal limits for cpu, memory and TPU
+                requests:
+                  cpu: 10
+                  memory: 50Gi
+                  google.com/tpu: 4
+                limits:
+                  cpu: 10
+                  memory: 50Gi
+                  google.com/tpu: 4
+# [END gke_aiml_hotswap_hero_train_job_workloads_high_priority_job]
diff --git a/ai-ml/hotswap-hero-train-job/workloads/low-priority-job-autopilot.yaml b/ai-ml/hotswap-hero-train-job/workloads/low-priority-job-autopilot.yaml
new file mode 100644
index 0000000000..c3e7bf1c6a
--- /dev/null
+++ b/ai-ml/hotswap-hero-train-job/workloads/low-priority-job-autopilot.yaml
@@ -0,0 +1,67 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# [START gke_aiml_hotswap_hero_train_job_workloads_low_priority_job_autopilot]
+apiVersion: jobset.x-k8s.io/v1alpha2
+kind: JobSet
+metadata:
+  name: low-priority
+  annotations:
+    alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool  # NOTE(review): presumably one child Job per node pool - confirm against JobSet docs
+spec:
+  failurePolicy:
+    maxRestarts: 100
+  replicatedJobs:
+  - name: job
+    replicas: 1  # the high-priority JobSet uses 2 replicas
+    template:
+      spec:
+        parallelism: 2
+        completions: 4
+        backoffLimit: 0
+        template:
+          metadata:
+            labels:
+              priority: low-priority
+          spec:
+            nodeSelector:
+              cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice
+              cloud.google.com/gke-tpu-topology: 2x4
+              cloud.google.com/gke-spot: "true"  # only present in the *-autopilot variants of these manifests
+            priorityClassName: low-prior-job  # PriorityClass defined in workloads/priority.yaml (value 1000000)
+            terminationGracePeriodSeconds: 0  # no grace period when the pod is terminated
+            containers:
+            - name: jax-tpu
+              image: python:3.8
+              ports:
+              - containerPort: 8471
+              - containerPort: 8080
+              - containerPort: 8431
+              command:
+              - bash
+              - -c
+              - |
+                pip install "jax[tpu]" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
+                python -c 'import jax; print("Global device count:", jax.device_count())'
+                sleep 60000
+              resources:  # requests equal limits for cpu, memory and TPU
+                requests:
+                  cpu: 10
+                  memory: 50Gi
+                  google.com/tpu: 4
+                limits:
+                  cpu: 10
+                  memory: 50Gi
+                  google.com/tpu: 4
+# [END gke_aiml_hotswap_hero_train_job_workloads_low_priority_job_autopilot]
diff --git a/ai-ml/hotswap-hero-train-job/workloads/low-priority-job.yaml b/ai-ml/hotswap-hero-train-job/workloads/low-priority-job.yaml
new file mode 100644
index 0000000000..98989f823b
--- /dev/null
+++ b/ai-ml/hotswap-hero-train-job/workloads/low-priority-job.yaml
@@ -0,0 +1,69 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# [START gke_aiml_hotswap_hero_train_job_workloads_low_priority_job]
+apiVersion: jobset.x-k8s.io/v1alpha2
+kind: JobSet
+metadata:
+  name: low-priority
+  annotations:
+    alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool  # NOTE(review): presumably one child Job per node pool - confirm against JobSet docs
+spec:
+  failurePolicy:
+    maxRestarts: 100
+  replicatedJobs:
+  - name: job
+    replicas: 1  # the high-priority JobSet uses 2 replicas
+    template:
+      spec:
+        parallelism: 2
+        completions: 4
+        backoffLimit: 0
+        template:
+          metadata:
+            labels:
+              priority: low-priority
+          spec:
+            hostNetwork: true
+            dnsPolicy: ClusterFirstWithHostNet  # paired with hostNetwork so cluster DNS is still used
+            nodeSelector:
+              cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice
+              cloud.google.com/gke-tpu-topology: 2x4  # same topology deploy.sh writes to terraform.tfvars
+            priorityClassName: low-prior-job  # PriorityClass defined in workloads/priority.yaml (value 1000000)
+            terminationGracePeriodSeconds: 0  # no grace period when the pod is terminated
+            containers:
+            - name: jax-tpu
+              image: python:3.8
+              ports:
+              - containerPort: 8471
+              - containerPort: 8080
+              - containerPort: 8431
+              securityContext:
+                privileged: true
+              command:
+              - bash
+              - -c
+              - |
+                pip install "jax[tpu]" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
+                python -c 'import jax; print("Global device count:", jax.device_count())'
+                sleep 60000
+              resources:  # requests equal limits for cpu, memory and TPU
+                requests:
+                  cpu: 10
+                  memory: 50Gi
+                  google.com/tpu: 4
+                limits:
+                  cpu: 10
+                  memory: 50Gi
+                  google.com/tpu: 4
+# [END gke_aiml_hotswap_hero_train_job_workloads_low_priority_job]
diff --git a/ai-ml/hotswap-hero-train-job/workloads/priority.yaml b/ai-ml/hotswap-hero-train-job/workloads/priority.yaml
new file mode 100644
index 0000000000..19cf2475a6
--- /dev/null
+++ b/ai-ml/hotswap-hero-train-job/workloads/priority.yaml
@@ -0,0 +1,31 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# [START gke_aiml_hotswap_hero_train_job_workloads_priority]
+apiVersion: scheduling.k8s.io/v1
+kind: PriorityClass
+metadata:
+  name: high-prior-job  # referenced by the high-priority JobSet manifests
+value: 2000000  # higher than low-prior-job (1000000)
+globalDefault: false
+description: "This priority class should be used for hero pods only."
+---
+apiVersion: scheduling.k8s.io/v1
+kind: PriorityClass
+metadata:
+  name: low-prior-job  # referenced by the low-priority JobSet manifests
+value: 1000000
+globalDefault: false
+description: "This priority class should be used for low priority pods only."
+# [END gke_aiml_hotswap_hero_train_job_workloads_priority]
\ No newline at end of file