7 changes: 7 additions & 0 deletions .gitignore
@@ -43,3 +43,10 @@ cscope.*

/bazel-*
*.pyc

# Helm chart dependencies cache
**/Chart.lock
**/charts/*.tgz

# Helm chart output directory
ai/ai-starter-kit/out
70 changes: 70 additions & 0 deletions ai/ai-starter-kit/Makefile
@@ -0,0 +1,70 @@
.PHONY: check_hf_token check_OCI_target package_helm lint dep_update install install_gke install_gke_gpu start start_gpu uninstall destroy push_helm validate_jupyterhub validate_ray
Reviewer: What is the usage of the make commands?

Author: You want me to document each?

Reviewer: Just in general in the README. Users can still follow the current README to install via Helm, so it's not clear when these make commands should be used.

Author: Documented in commit: 78a03d7


check_hf_token:
ifndef HF_TOKEN
$(error HF_TOKEN is not set)
endif

check_OCI_target:
ifndef OCI_HELM_TARGET
$(error OCI_HELM_TARGET is not set)
endif

package_helm:
helm package helm-chart/ai-starter-kit/ --destination out/

push_helm: check_OCI_target
helm push out/ai-starter-kit* oci://$$OCI_HELM_TARGET

lint:
helm lint helm-chart/ai-starter-kit

dep_update:
helm dependency update helm-chart/ai-starter-kit

install: check_hf_token
helm upgrade --install ai-starter-kit helm-chart/ai-starter-kit --set huggingface.token="$$HF_TOKEN" --timeout 10m -f helm-chart/ai-starter-kit/values.yaml

install_gke: check_hf_token
helm upgrade --install ai-starter-kit helm-chart/ai-starter-kit --set huggingface.token="$$HF_TOKEN" --timeout 10m -f helm-chart/ai-starter-kit/values-gke.yaml

install_gke_gpu: check_hf_token
helm upgrade --install ai-starter-kit helm-chart/ai-starter-kit --set huggingface.token="$$HF_TOKEN" --timeout 10m -f helm-chart/ai-starter-kit/values-gke-gpu.yaml

start:
mkdir -p /tmp/models-cache
minikube start --cpus 4 --memory 15000 --mount --mount-string="/tmp/models-cache:/tmp/models-cache"

start_gpu:
mkdir -p $$HOME/models-cache
minikube start --driver krunkit --cpus 4 --memory 15000 --mount --mount-string="$$HOME/models-cache:$$HOME/models-cache"

uninstall:
helm uninstall ai-starter-kit
kubectl delete pod jupyter-user
kubectl delete pvc ai-starter-kit-jupyterhub-hub-db-dir

destroy:
minikube delete

validate_jupyterhub:
kubectl get pods; \
kubectl wait --for=condition=Ready pods -l 'component!=continuous-image-puller' --timeout=1800s; \
kubectl get pods; \
kubectl get services; \
kubectl port-forward service/ai-starter-kit-jupyterhub-proxy-public 8081:80 & \
PID=$$!; \
echo "Port-forward PID=$${PID}"; \
sleep 5s; \
python3 ./ci/test_hub.py "127.0.0.1:8081"; \
kill $$PID

validate_ray:
kubectl wait --for=condition=Ready pods -l 'app.kubernetes.io/created-by=kuberay-operator' --timeout=1800s; \
kubectl get pods; \
kubectl get services; \
kubectl port-forward service/ai-starter-kit-kuberay-head-svc 8265:8265 & \
PID=$$!; \
sleep 10s; \
ray job submit --address=http://127.0.0.1:8265 -- python -c "import ray; ray.init(); print(ray.cluster_resources())"; \
kill $$PID
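
Following up on the review thread above about when these make targets are meant to be used: the sketch below shows one possible local workflow. It assumes the commands are run from ai/ai-starter-kit/ and that <your-hugging-face-token> is replaced with a valid Hugging Face token; the ordering is a suggestion, not part of this change.

# Start a local minikube cluster with the models cache mounted
make start

# Fetch the Helm chart dependencies
make dep_update

# Install the chart; check_hf_token fails fast if HF_TOKEN is unset
HF_TOKEN=<your-hugging-face-token> make install

# Smoke-test JupyterHub and Ray once the pods are ready
make validate_jupyterhub
make validate_ray

# Tear everything down
make uninstall
make destroy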
9 changes: 9 additions & 0 deletions ai/ai-starter-kit/ci/terraform/default_env.tfvars
@@ -0,0 +1,9 @@
project_id = ""
default_resource_name = ""

cluster_name = "" # Leave empty to use the default name (default_resource_name)
cluster_location = "us-central1"
private_cluster = false
autopilot_cluster = true

service_account_name = "" # Leave empty to use the default name
108 changes: 108 additions & 0 deletions ai/ai-starter-kit/ci/terraform/main.tf
@@ -0,0 +1,108 @@
terraform {

required_providers {
kubectl = {
source = "gavinbunney/kubectl"
version = ">= 1.19.0"
}
}
}
data "google_client_config" "default" {}


data "google_project" "project" {
project_id = var.project_id
}


locals {
cluster_name = var.cluster_name != "" ? var.cluster_name : var.default_resource_name
}

module "gke_cluster" {
source = "github.com/ai-on-gke/common-infra/common/infrastructure?ref=main"

project_id = var.project_id
cluster_name = local.cluster_name
cluster_location = var.cluster_location
autopilot_cluster = var.autopilot_cluster
private_cluster = var.private_cluster
create_network = false
network_name = "default"
subnetwork_name = "default"
enable_gpu = true
gpu_pools = [
{
name = "gpu-pool-l4"
machine_type = "g2-standard-24"
node_locations = "us-central1-a" ## comment out to autofill node_locations based on cluster_location
autoscaling = true
min_count = 1
max_count = 3
disk_size_gb = 100
disk_type = "pd-balanced"
enable_gcfs = true
logging_variant = "DEFAULT"
accelerator_count = 2
accelerator_type = "nvidia-l4"
gpu_driver_version = "DEFAULT"
}
]
ray_addon_enabled = false
}

locals {
#ca_certificate = base64decode(module.gke_cluster.ca_certificate)
cluster_membership_id = var.cluster_membership_id == "" ? local.cluster_name : var.cluster_membership_id
host = var.private_cluster ? "https://connectgateway.googleapis.com/v1/projects/${data.google_project.project.number}/locations/${var.cluster_location}/gkeMemberships/${local.cluster_membership_id}" : "https://${module.gke_cluster.endpoint}"

}

provider "kubernetes" {
alias = "ai_starter_kit"
host = local.host
token = data.google_client_config.default.access_token
cluster_ca_certificate = var.private_cluster ? "" : base64decode(module.gke_cluster.ca_certificate)

dynamic "exec" {
for_each = var.private_cluster ? [1] : []
content {
api_version = "client.authentication.k8s.io/v1beta1"
command = "gke-gcloud-auth-plugin"
}
}
}

locals {
service_account_name = var.service_account_name != "" ? var.service_account_name : var.default_resource_name
}


module "ai_starter_kit_workload_identity" {
providers = {
kubernetes = kubernetes.ai_starter_kit
}
source = "terraform-google-modules/kubernetes-engine/google//modules/workload-identity"
name = local.service_account_name
namespace = "default"
roles = ["roles/storage.objectUser"]
project_id = var.project_id
depends_on = [module.gke_cluster]
}

provider "kubectl" {
alias = "ai_starter_kit"
apply_retry_count = 15
host = local.host
token = data.google_client_config.default.access_token
cluster_ca_certificate = var.private_cluster ? "" : base64decode(module.gke_cluster.ca_certificate)
load_config_file = true

dynamic "exec" {
for_each = var.private_cluster ? [1] : []
content {
api_version = "client.authentication.k8s.io/v1beta1"
command = "gke-gcloud-auth-plugin"
}
}
}
15 changes: 15 additions & 0 deletions ai/ai-starter-kit/ci/terraform/outputs.tf
@@ -0,0 +1,15 @@

output "gke_cluster_name" {
value = local.cluster_name
description = "GKE cluster name"
}

output "gke_cluster_location" {
value = var.cluster_location
description = "GKE cluster location"
}

output "project_id" {
value = var.project_id
description = "GKE cluster location"
}
26 changes: 26 additions & 0 deletions ai/ai-starter-kit/ci/terraform/variables.tf
@@ -0,0 +1,26 @@
variable "project_id" {
type = string
}
variable "default_resource_name" {
type = string
}
variable "cluster_name" {
type = string
}
variable "cluster_location" {
type = string
}
variable "autopilot_cluster" {
type = bool
}
variable "private_cluster" {
type = bool
}
variable "cluster_membership_id" {
type = string
description = "require to use connectgateway for private clusters, default: cluster_name"
default = ""
}
variable "service_account_name" {
type = string
}
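
As a usage sketch only (not part of this change), the CI cluster could be provisioned with the variables above roughly as follows, run from ai/ai-starter-kit/ci/terraform; the project_id and default_resource_name values below are placeholders.

# Provision the CI GKE cluster (placeholder values; adjust to your project)
cd ai/ai-starter-kit/ci/terraform
terraform init
terraform apply -var-file=default_env.tfvars \
  -var="project_id=my-gcp-project" \
  -var="default_resource_name=ai-starter-kit-ci"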
59 changes: 59 additions & 0 deletions ai/ai-starter-kit/ci/test_hub.py
@@ -0,0 +1,59 @@
import sys
import requests
from packaging.version import Version as V


def test_hub_up(hub_url):
r = requests.get(hub_url)
r.raise_for_status()
print("JupyterHub up.")


def test_api_root(hub_url):
"""
Tests the hub API's root endpoint (/hub/api). The hub's version should be returned.

A typical JupyterHub log line for this test:

[I 2019-09-25 12:03:12.051 JupyterHub log:174] 200 GET /hub/api ([email protected]) 9.57ms
"""
r = requests.get(hub_url + "/hub/api")
r.raise_for_status()
info = r.json()
version = info["version"]
assert V("4") <= V(version) <= V("5.5"), f"version {version} must be between 4 and 5.5"
print("JupyterHub Rest API is working.")


def test_hub_login(hub_url):
"""
Tests the hub's dummy authenticator login credentials. The login credentials are
read from /jupyter_config/config.yaml. After a successful login, the user is
redirected to /hub/spawn.
"""
username, password = "user", "sneakypass"
session = requests.Session()

response = session.get(hub_url + "/hub/login")
response.raise_for_status()

auth_params = {}
if "_xsrf" in session.cookies:
auth_params = {"_xsrf": session.cookies["_xsrf"]}

response = session.post(
hub_url + "/hub/login",
params=auth_params,
data={"username": username, "password": password},
allow_redirects=True,
)
response.raise_for_status()
assert (hub_url + "/hub/spawn-pending/user") in response.url, f"unexpected response url: got {response.url}, expected {hub_url}/hub/spawn-pending/user"
print("JupyterHub login success.")


hub_url = "http://" + sys.argv[1]

test_hub_up(hub_url)
test_api_root(hub_url)
test_hub_login(hub_url)